In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import pandas as pd
import re

In [2]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Checkpoint id shared by the model and its tokenizer.
model_name = 'meta-llama/Meta-Llama-3-8B'

# Load the causal LM with bfloat16 weights; device_map="auto" leaves weight
# placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # load_in_8bit=True,  # 8-bit loading is currently disabled
)

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# model_name = "meta-llama/Meta-Llama-3-8B"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#                                              torch_dtype=torch.bfloat16,
#                                              device_map="auto"
#                                             )
# tokenizer = AutoTokenizer.from_pretrained('ZWG817/Llama3_Chat_Materials')
# model.resize_token_embeddings(len(tokenizer))

# model.load_adapter('ZWG817/Llama3_Chat_Materials')

In [5]:
# Fetch the full-text dataset and keep its "train" split.
data = load_dataset("ZWG817/FullContent")
data_train = data["train"]
print(data_train)

# Optional local evaluation split (currently disabled):
# custom_data = load_dataset('json', data_files='data_eval.json')
# data_val = custom_data['train']

# Read materials.txt in one go, then split it into a list of lines.
with open('materials.txt', mode='r') as file:
    raw_text = file.read()
word_list = raw_text.splitlines()

Dataset({
    features: ['title', 'content'],
    num_rows: 2365
})


In [6]:
# Turn the train split into a DataFrame so it can be filtered with pandas.
df = pd.DataFrame(data=data_train)
df

Unnamed: 0,title,content
0,1512.03112v2.Hall_effect_and_Fermi_surface_rec...,arXiv:1512.03112v2 [cond-mat.supr-con] 9 Feb...
1,2009.14550v2.Out_of_plane_transport_of_1T_TaS2...,\n1 \n Out-of-plane transport of 1T -TaS 2/g...
2,1011.3934v1.Effect_of_electron_phonon_coupling...,Eect of electron-phonon coupling on transmiss...
3,2108.06226v2.Giant_nonlinear_response_due_to_u...,Giant nonlinear response due to unconventional...
4,2003.04501v1.Experimental_observations_indicat...,\nExperimental observations indicating the t...
...,...,...
2360,0904.2164v2.Chiral_asymmetry_of_the_Fermi_surf...,arXiv:0904.2164v2 [hep-ph] 15 Sep 2009Chiral...
2361,1509.07865v1.Composite_fermions_and_the_field_...,Composite fermions and the eld-tuned supercon...
2362,1007.2058v1.Modeling_of_complex_oxide_material...,arXiv:1007.2058v1 [cond-mat.str-el] 13 Jul 2...
2363,0804.0001v1.2D_skew_scattering_in_the_vicinity...,arXiv:0804.0001v1 [cond-mat.mes-hall] 31 Mar...


In [7]:
# Keep only the papers whose text contains the substring "/atom".
#   regex=False - "/atom" is a literal substring, not a pattern.
#   na=False    - rows with missing content are treated as "no match" instead of
#                 putting NaN into the mask, which boolean indexing rejects.
mentions_per_atom = df['content'].str.contains("/atom", regex=False, na=False)
df_target = df[mentions_per_atom]
# df_target = df[df['content'].str.replace("\n","")].reset_index()[['title','content']]
df_target

Unnamed: 0,title,content
21,1210.1241v3.Electronic_structure_and_magnetism...,epl draft\nCorrelation eects and spin-orbit i...
31,1411.2781v1.A_multi_scale_approach_to_the_elec...,A multi-scale approach to the electronic struc...
82,1708.04323v1.Robust_Determination_of_the_Chemi...,arXiv:1708.04323v1 [physics.comp-ph] 14 Aug ...
83,1811.00776v1.First_principles_investigation_of...,"1 \n First -principles investigation of Ag -, ..."
229,2208.08898v1.Irida_Graphene__A_New_2D_Carbon_A...,Irida-Graphene: A New 2D Carbon Allotrope\nM. ...
...,...,...
2308,2001.02042v2.Diverse_fundamental_properties_in...,Diverse fundamental properties in stage- n gra...
2310,1703.04104v2.Electronic_origin_of_melting_T_P_...,\n1Electronic origin of melting T –P curves o...
2315,2005.08327v2.Remarkable_low_energy_properties_...,Remarkable low-energy properties of the pseudo...
2323,1503.08951v1.Room_Temperature_Quantum_Spin_Hal...,Room Temperature Quantum Spin Hall Insulators ...


In [8]:
# Collect every span ending in "eV/atom" from the filtered papers.
#
# Fix: the pattern used to end in `atom*`, where `*` applied only to the final
# "m" (so "eV/ato" matched as well); it now requires the literal "eV/atom".
#
# NOTE(review): the bracketed part is a negated character class, not a group.
# It matches exactly one character that is none of ( ) | + - ? . $ or a digit
# (and, unlike `.`, it can match a newline). It is kept as written because the
# intended number-matching behaviour cannot be determined from this notebook.
energy_span = re.compile(r'[A-Z].*[^(\-|\+)?\d+(\.\d+)?$]eV/atom')

target = []
for content in df_target['content']:
    # The pattern has no capture groups, so findall returns whole matches.
    target.extend(energy_span.findall(content))
print(len(target))

98


In [9]:
def generate_prompt(dialogue, summary=None, eos_token="</s>"):
    """Assemble the DOS-extraction prompt for one piece of text.

    Args:
        dialogue: Text to analyse; a newline is appended to it.
        summary: Optional answer text. When truthy it is placed after the
            "Extract:" marker, followed by a space and ``eos_token``; otherwise
            the section after the marker is left blank.
        eos_token: Marker appended after ``summary``.

    Returns:
        The instruction block, the text and the "Extract:" section joined by
        single spaces.
    """
    # Written out line by line so the leading indentation of each line and the
    # trailing space after "eV/atom" (all part of the prompt) stay visible.
    instruction = (
        "\n"
        '    As a knowledgeable scientific assistant, your task is to analyze the text focusing on the Low Density of States (DOS) at the Fermi level for specific materials. The unit of Density of States (DOS) is "eV/atom" \n'
        "    Please provide the result as follows:\n"
        "    if the text contains a material has Density of States (DOS), please tell me the material name, its chemical formula and its Density of States (DOS)\n"
        '    if the text does not contain Density of States (DOS), please tell me "No finds"\n'
        "    "
    )

    text_section = f"{dialogue}\n"

    # Only a truthy summary contributes an answer (and the end-of-sequence marker).
    if summary:
        answer = summary + ' ' + eos_token
    else:
        answer = ''
    extract_section = "Extract:\n " + answer + " "

    return " ".join((instruction, text_section, extract_section))

# Preview the prompt built for one of the extracted spans.
example_prompt = generate_prompt(target[5])
print(example_prompt)


    As a knowledgeable scientific assistant, your task is to analyze the text focusing on the Low Density of States (DOS) at the Fermi level for specific materials. The unit of Density of States (DOS) is "eV/atom" 
    Please provide the result as follows:
    if the text contains a material has Density of States (DOS), please tell me the material name, its chemical formula and its Density of States (DOS)
    if the text does not contain Density of States (DOS), please tell me "No finds"
     V/atom)47, arsenene (-2.99 eV/atom
 Extract:
  


In [10]:
# Smoke-test generation on a single extracted span.
input_prompt = generate_prompt(target[90])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

# torch.autocast(device_type="cuda") is the supported spelling of the
# deprecated torch.cuda.amp.autocast() context manager.
with torch.autocast(device_type="cuda"):
    generation_output = model.generate(
        input_ids=input_tokens,
        max_new_tokens=1000,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        # The EOS id is reused as the pad id.
        pad_token_id=tokenizer.eos_token_id,
    )

op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
# Print everything that precedes the first "</" in the decoded text.
print(op.split("</")[0])

KeyboardInterrupt: 

In [None]:
# Cut the decoded text at the first "</" (the whole text if it is absent).
result = op.partition("</")[0]
# print(result)

# Keep what follows the last "Extract:\n" marker (the whole text if it is absent).
output = result.rpartition("Extract:\n")[2]

print(output)

In [13]:
import re
import pandas as pd
import numpy as np

# Run the extraction prompt over every span in `target` and tabulate the answers
# in `df` with the columns Number / Content / Output.
#
# Rows are accumulated in `results` and the frame is built once, inside
# `finally`, instead of growing a DataFrame row by row; an interrupted run
# therefore still leaves the rows finished so far in `df`.
# NOTE(review): `df` is also the name of the papers DataFrame built earlier in
# this notebook, so that frame is replaced here.
results = []
try:
    for i, j in enumerate(target):
        print(i)
        input_prompt = generate_prompt(j)
        input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
        # torch.autocast(device_type="cuda") is the supported spelling of the
        # deprecated torch.cuda.amp.autocast() context manager.
        with torch.autocast(device_type="cuda"):
            generation_output = model.generate(
                input_ids=input_tokens,
                max_new_tokens=1000,
                do_sample=True,
                top_k=10,
                top_p=0.9,
                temperature=0.3,
                repetition_penalty=1.15,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                # The EOS id is reused as the pad id.
                pad_token_id=tokenizer.eos_token_id,
            )
        op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
        # print(op.split("</")[0])
        # Cut at the first "</", then keep what follows the last "Extract:\n" marker.
        result = op.split("</")[0]
        content = j
        output = result.split("Extract:\n")[-1]
        results.append({'Number': i, 'Content': content, 'Output': output})

        # if i >= 10:
        #     break
finally:
    df = pd.DataFrame(results, columns=['Number', 'Content', 'Output'])

0
1
2
3
4
5
6
7
8
9
10


In [14]:
# Display the table of collected extraction results.
df

Unnamed: 0,Number,Content,Output
0,0,E (meV/atom,1. Material Name: \n 2. Chemical Formula:...
1,1,E (meV/atom,1. The material's name.\n 2. Its chemical...
2,2,Hubbard U(eV)-1.5-1-0.50∆E (meV/atom,2.8 eV/atom\n 3.4 eV/atom\n 6.7 eV/atom...
3,3,Energy [eV/atom,1.0\n 2.0\n 3.0\n 4.0\n 5.0\n\n
4,4,E Bis -3.811 eV/atom,1. The density of states at the fermi energ...
5,5,"V/atom)47, arsenene (-2.99 eV/atom",1. Material: Arsenene\n 2. Chemical Formu...
6,6,"While the EFormis found to be −0.69 eV, the EC...",The density of states at the Fermi energy w...
7,7,La 2.58Te4 is 23.7 meV/atom,La 2.58Te4 is 23.7 meV/atom\n\n
8,8,DOS (states/eV/atom,1. The material name.\n 2. Its chemical f...
9,9,DOS (states/eV/atom,1. Material Name: \n 2. Chemical Formula:...


In [15]:
# Write the results table to DOS.csv without the index column.
df.to_csv('DOS.csv', index=False)

In [16]:
# Inspect the token ids of the most recently tokenized prompt.
input_tokens

tensor([[128000,    198,    262,   1666,    264,  42066,  12624,  18328,     11,
            701,   3465,    374,    311,  24564,    279,   1495,  21760,    389,
            279,  12310,  73710,    315,   4273,    320,     35,   3204,      8,
            520,    279,  99362,     72,   2237,    369,   3230,   7384,     13,
           5321,   3493,    279,   1121,    439,  11263,    512,    262,    422,
            279,   1495,   5727,    264,   3769,    706,  73710,    315,   4273,
            320,     35,   3204,    705,   4587,   3371,    757,    279,   3769,
            836,     11,   1202,  11742,  15150,    323,   1202,  73710,    315,
           4273,    320,     35,   3204,    340,    262,    422,    279,   1495,
           1587,    539,   6782,  73710,    315,   4273,    320,     35,   3204,
            705,   4587,   3371,    757,    330,   2822,  14035,    702,    262,
           6227,     11,    279,   5089,    315,  73710,    315,   4273,    320,
             35,   3204,    

In [None]:
# Save the model under the local 'result' directory.
# NOTE(review): `save_embedding_layers` looks like an option of PEFT's
# save_pretrained; in this notebook `model` is created with
# AutoModelForCausalLM and no PEFT wrapping is visible - confirm the argument
# is accepted before relying on this cell.
model.save_pretrained('result', save_embedding_layers=True)

In [None]:
# for param in model.parameters():
#     print(param.dtype)

In [None]:
# Show the model object's repr.
model

In [None]:
# for name, param in model.named_parameters():
#     print(param.requires_grad, name, param.dtype)

In [None]:
# Verify which parameters are trainable
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"Trainable: {name}")

In [None]:
# for name, param in model.named_parameters():
#     print(f"Trainable: {name}", param.requires_grad)

In [None]:
# model.parameters()

In [None]:
# model.get_input_embeddings()

In [None]:
# Report the model's parameter count.
model.num_parameters()

In [None]:
# tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# model.resize_token_embeddings(len(tokenizer))