In [1]:
import torch
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

In [3]:
# gilbert = pd.read_excel('gilbert_values.xlsx')
# gilbert['text'] = 'The gilbert damping constant of ' + gilbert['material_name'] + ' is '+ gilbert['gilbert_damping_constant'].apply(lambda x: str(x))
# gilbert

In [4]:
# resistivity = pd.read_excel('Resistivity_Mag_Drop_duplicates.xlsx')
# resistivity['text'] = 'The resistivity of ' + resistivity['Material_New'] + ' is '+ resistivity['Value'].apply(lambda x: str(x)) + 'Ohm.m'
# resistivity

In [5]:
# dataset = pd.DataFrame(columns = ['text'])
# dataset = pd.concat([dataset,resistivity['text']])
# dataset = pd.concat([dataset,gilbert['text']])
# dataset

In [6]:
# dataset.to_csv('value.csv',index = False)

In [7]:
dataset = load_dataset("ZWG817/Materials_Prompt")

In [8]:
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

In [9]:
def embed(batch):
    """
    adds a column to the dataset called 'embeddings'
    """
    # or you can combine multiple columns here
    # For example the title and the text
    information = batch["text"]
    return {"embeddings" : ST.encode(information)}

dataset = dataset.map(embed,batched=True,batch_size=16)


In [10]:
data = dataset["train"]
data = data.add_faiss_index("embeddings")

  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
def search(query: str, k: int = 3 ):
    """a function that embeds a new query and returns the most probable results"""
    embedded_query = ST.encode(query) # embed new query
    scores, retrieved_examples = data.get_nearest_examples( # retrieve results
        "embeddings", embedded_query, # compare our new embedded query with the dataset embeddings
        k=k # get only top k results
    )
    return scores, retrieved_examples

In [12]:
scores , result = search("yig", 2 ) 
result['text']

['The gilbert damping constant of YIG is 7.35e-05',
 'The gilbert damping constant of YIG YIG is 0.008']

In [13]:
def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model = model.bfloat16()
    return model, tokenizer

In [14]:
# model_name = 'nvidia/Llama-3.1-Nemotron-70B-Instruct-HF'
model_name = 'meta-llama/Llama-3.1-8B-Instruct'
model, tokenizer = load_model(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [16]:
def format_prompt(prompt,retrieved_documents,k):
  """using the retrieved documents we will prompt the model to generate our responses"""
  PROMPT = f"Question:{prompt}\nContext:"
  for idx in range(k) :
    PROMPT+= f"{retrieved_documents['text'][idx]}\n"
  return PROMPT

def generate(formatted_prompt):
  formatted_prompt = formatted_prompt
  messages = [{"role":"system","content":SYS_PROMPT1},
              {"role":"system","content":SYS_PROMPT2},
              {"role":"user","content":formatted_prompt}]
  # tell the model to generate
  input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt"
  ).to(model.device)
  outputs = model.generate(
      input_ids,
      max_new_tokens=1024,
      eos_token_id=[
          tokenizer.eos_token_id,
          tokenizer.convert_tokens_to_ids("<|eot_id|>")
      ],
      do_sample=True,
      temperature=0.6,
      top_p=0.9,
  )
  response = outputs[0][input_ids.shape[-1]:]
  return tokenizer.decode(response, skip_special_tokens=True)

def rag_chatbot(prompt:str,k:int=2):
  scores , retrieved_documents = search(prompt, k)
  formatted_prompt = format_prompt(prompt,retrieved_documents,k)
  return generate(formatted_prompt)


In [17]:
SYS_PROMPT1 = """You are an assistant for answering questions.
You are given the extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, please make an inference based on the data given"""

SYS_PROMPT2 = """The data provide is the gilbert damping constant and material resistivity about materials. Usually, high resistivity would lead to low gilbert damping constant. Now Please Answer the question below:
"""

In [18]:
rag_chatbot("Please list the prediction for at least 5 new magnetic materials (completely new) with extremely low gilbert damping. Make sure provide its material formula", k = 50)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


"Based on the provided data, we can infer that materials with high resistivity tend to have low Gilbert damping constants. With this in mind, let's propose five new magnetic materials with extremely low Gilbert damping constants:\n\n1. **Material Formula:** Y2Ir3Fe5O12\n**Gilbert Damping Constant:** 1e-06 (predicted to be extremely low due to high resistivity and similarity to Y3Fe5O12)\nYttrium (Y) is a good candidate for creating materials with low Gilbert damping constants, as seen in Y3Fe5O12. Adding Ir (Iridium) to the formula may further increase the resistivity and reduce the Gilbert damping constant.\n\n2. **Material Formula:** Mn2V3Fe5SiO12\n**Gilbert Damping Constant:** 5e-07 (predicted to be extremely low due to high resistivity and similarity to MnSi)\nManganese (Mn) is known for its high resistivity, and adding V (Vanadium) and Si (Silicon) may further increase the resistivity and reduce the Gilbert damping constant.\n\n3. **Material Formula:** Co2Ti3Fe5SiO12\n**Gilbert Da