In [1]:
import torch
import os
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, PeftModel
from trl import SFTTrainer

In [2]:
# os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

In [3]:
gilbert = pd.read_excel('gilbert_values.xlsx')
gilbert['text'] = 'The gilbert damping constant of ' + gilbert['material_name'] + ' is '+ gilbert['gilbert_damping_constant'].apply(lambda x: str(x))
gilbert

Unnamed: 0,material_name,gilbert_damping_constant,text
0,MnSi,0.000010,The gilbert damping constant of MnSi is 1e-05
1,Insulating ferromagnets,0.000010,The gilbert damping constant of Insulating fer...
2,NiMnSb,0.000010,The gilbert damping constant of NiMnSb is 1e-05
3,BYIG,0.000014,The gilbert damping constant of BYIG is 1.4e-05
4,V[TCNE]2,0.000038,The gilbert damping constant of V[TCNE]2 is 3....
...,...,...,...
286,BiCaSi-YIG,2.631023,The gilbert damping constant of BiCaSi-YIG is ...
287,GdGa-YIG,3.531022,The gilbert damping constant of GdGa-YIG is 3....
288,Y3Fe4GaO12,51.000000,The gilbert damping constant of Y3Fe4GaO12 is ...
289,NiFe2O4,710.000000,The gilbert damping constant of NiFe2O4 is 710.0


In [4]:
resistivity = pd.read_excel('Resistivity_Mag_Drop_duplicates.xlsx')
resistivity['text'] = 'The resistivity of ' + resistivity['Material_New'] + ' is '+ resistivity['Value'].apply(lambda x: str(x)) + 'Ohm.m'
resistivity

Unnamed: 0,Material_New,Value,text
0,ZnO-V2O5,1.190000e+04,The resistivity of ZnO-V2O5 is 11900.0Ohm.m
1,BiFe0.75Ti0.25O3,5.500000e+08,The resistivity of BiFe0.75Ti0.25O3 is 5500000...
2,P(VDF-TrFE),1.000000e+01,The resistivity of P(VDF-TrFE) is 10.0Ohm.m
3,Ni1−xZnxFe2O4,1.000000e+11,The resistivity of Ni1−xZnxFe2O4 is 1000000000...
4,NiO,1.000000e+11,The resistivity of NiO is 100000000000.0Ohm.m
...,...,...,...
801,Copper-Nickel (Cu-2%Ni) clad,1.350000e-08,The resistivity of Copper-Nickel (Cu-2%Ni) cla...
802,Co1-xFe0.4Ni0.6Si0.4B0.6,1.200000e-08,The resistivity of Co1-xFe0.4Ni0.6Si0.4B0.6 is...
803,Au/Pt/Ti/Ni,1.100000e-08,The resistivity of Au/Pt/Ti/Ni is 1.1e-08Ohm.m
804,Fe-Si,7.300000e+01,The resistivity of Fe-Si is 73.0Ohm.m


In [5]:
dataset = pd.DataFrame(columns = ['text'])
dataset = pd.concat([dataset,resistivity['text']])
dataset = pd.concat([dataset,gilbert['text']])
dataset

Unnamed: 0,text
0,The resistivity of ZnO-V2O5 is 11900.0Ohm.m
1,The resistivity of BiFe0.75Ti0.25O3 is 5500000...
2,The resistivity of P(VDF-TrFE) is 10.0Ohm.m
3,The resistivity of Ni1−xZnxFe2O4 is 1000000000...
4,The resistivity of NiO is 100000000000.0Ohm.m
...,...
286,The gilbert damping constant of BiCaSi-YIG is ...
287,The gilbert damping constant of GdGa-YIG is 3....
288,The gilbert damping constant of Y3Fe4GaO12 is ...
289,The gilbert damping constant of NiFe2O4 is 710.0


In [6]:
# dataset.to_csv('value.csv',index = False)

In [7]:
dataset = load_dataset("ZWG817/Materials_Prompt")

In [8]:
from sentence_transformers import SentenceTransformer
ST = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

In [9]:
def embed(batch):
    """
    adds a column to the dataset called 'embeddings'
    """
    # or you can combine multiple columns here
    # For example the title and the text
    information = batch["text"]
    return {"embeddings" : ST.encode(information)}

dataset = dataset.map(embed,batched=True,batch_size=16)


In [10]:
data = dataset["train"]
data = data.add_faiss_index("embeddings")

  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
def search(query: str, k: int = 3 ):
    """a function that embeds a new query and returns the most probable results"""
    embedded_query = ST.encode(query) # embed new query
    scores, retrieved_examples = data.get_nearest_examples( # retrieve results
        "embeddings", embedded_query, # compare our new embedded query with the dataset embeddings
        k=k # get only top k results
    )
    return scores, retrieved_examples

In [12]:
scores , result = search("yig", 4 ) 
result['text']

['The gilbert damping constant of YIG is 7.35e-05',
 'The gilbert damping constant of YIG YIG is 0.008',
 'The gilbert damping constant of YIG YIG with 6% Os is 0.092',
 'The gilbert damping constant of Ga, Sc - YIG is 5.1e-05']

In [13]:
def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    model = model.bfloat16()
    return model, tokenizer

In [14]:
# model_name = 'nvidia/Llama-3.1-Nemotron-70B-Instruct-HF'
model_name = 'meta-llama/Llama-3.1-8B-Instruct'
model, tokenizer = load_model(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [20]:
def format_prompt(prompt,retrieved_documents,k):
  """using the retrieved documents we will prompt the model to generate our responses"""
  PROMPT = f"Question:{prompt}\nContext:"
  for idx in range(k) :
    PROMPT+= f"{retrieved_documents['text'][idx]}\n"
  return PROMPT

def generate(formatted_prompt):
  formatted_prompt = formatted_prompt[:2000] # to avoid GPU OOM
  messages = [{"role":"system","content":SYS_PROMPT1},
              {"role":"system","content":SYS_PROMPT2},
              {"role":"user","content":formatted_prompt}]
  # tell the model to generate
  input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt"
  ).to(model.device)
  outputs = model.generate(
      input_ids,
      max_new_tokens=1024,
      eos_token_id=[
          tokenizer.eos_token_id,
          tokenizer.convert_tokens_to_ids("<|eot_id|>")
      ],
      do_sample=True,
      temperature=0.6,
      top_p=0.9,
  )
  response = outputs[0][input_ids.shape[-1]:]
  return tokenizer.decode(response, skip_special_tokens=True)

def rag_chatbot(prompt:str,k:int=2):
  scores , retrieved_documents = search(prompt, k)
  formatted_prompt = format_prompt(prompt,retrieved_documents,k)
  return generate(formatted_prompt)


In [17]:
SYS_PROMPT1 = """You are an assistant for answering questions.
You are given the extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, please make an inference based on the data given"""

SYS_PROMPT2 = """The data provide is the gilbert damping constant and material resistivity about materials. Usually, high resistivity would lead to low gilbert damping constant. Now Please Answer the question below:
"""

In [25]:
rag_chatbot("Please predict at least 5 new magnetic materials (completely new) with extremely low gilbert damping. Make sure provide its material formula", k = 10)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"Based on the provided data, I'll attempt to predict five new magnetic materials with extremely low Gilbert damping constants. Please note that these predictions are inferences based on the given data and might not be accurate in reality.\n\n1. **Material:** Yttrium-Indium-Iron (YInFe)\n**Formula:** Y0.8In0.2Fe\n**Gilbert Damping Constant:** 0.0008 (predicted to be lower than Insulating ferromagnets due to high resistivity)\nRationale: Yttrium and Indium have high resistivity, which is often associated with low Gilbert damping. The addition of Iron will provide ferromagnetic properties.\n\n2. **Material:** Lanthanum-Copper-Iron (LaCuFe)\n**Formula:** La0.7Cu0.3Fe\n**Gilbert Damping Constant:** 0.0005 (predicted to be lower than Fe0.5Co0.5 due to high resistivity and low damping)\nRationale: Lanthanum has a high resistivity, and Copper is known for its high resistivity as well. The addition of Iron will provide ferromagnetic properties.\n\n3. **Material:** Gadolinium-Manganese-Iron (GdM