In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import pandas as pd
import re

In [2]:
# !huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
model_name = 'meta-llama/Meta-Llama-3-8B'
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             #load_in_8bit=True,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto"
                                            )
tokenizer = AutoTokenizer.from_pretrained(model_name)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# model_name = "meta-llama/Meta-Llama-3-8B"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#                                              torch_dtype=torch.bfloat16,
#                                              device_map="auto"
#                                             )
# tokenizer = AutoTokenizer.from_pretrained('ZWG817/Llama3_Chat_Materials')
# model.resize_token_embeddings(len(tokenizer))

# model.load_adapter('ZWG817/Llama3_Chat_Materials')

In [5]:
data = load_dataset("ZWG817/FullContent")
data_train = data["train"]
print(data_train)

#custom_data = load_dataset('json', data_files='data_eval.json')
#data_val = custom_data['train']

with open('materials.txt', 'r') as file:
    word_list = file.read().splitlines()

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/17 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset({
    features: ['content', 'title'],
    num_rows: 3453
})


In [6]:
df = pd.DataFrame(data_train)
df

Unnamed: 0,content,title
0,"PHYSICALREVIEW VOLUME 102,NUMBER6 JUNE15,1956\...",PhysRev.102.1451.pdf
1,"PHYSICALREVIEW VOLUME 104~-NUMBER2 OCTOBER 15,...",PhysRev.104.343.pdf
2,"PHYSICAL REVIEW VOLUME 105,NUMHER 2 JANUARY 15...",PhysRev.105.522.pdf
3,"PHYSICAL REVIEW VOLUM E106,NUMBER 3 MAY1,1957\...",PhysRev.106.480.pdf
4,"PHYSICALREVIEW VOLUM E108,NUMBER6 DECEM BER15,...",PhysRev.108.1397.pdf
...,...,...
3448,arXiv:0904.1996v3 [cond-mat.mes-hall] 4 Sep ...,0904.1996v3.Breakdown_of_the_N_0_Quantum_Hall_...
3449,Page 1 Unlocking Ultrastron g High-Temperature...,2310.20441v2.Unlocking_ultrastrong_high_temper...
3450,Switching Synchronization in One-Dimensional M...,1507.01640v1.Switching_Synchronization_in_One_...
3451,arXiv:1211.5215v1 [cond-mat.mtrl-sci] 22 Nov...,1211.5215v1.Physical_properties_and_band_struc...


In [7]:
df_target = df[df['content'].str.contains("S/m")]
# df_target = df[df['content'].str.replace("\n","")].reset_index()[['title','content']]
df_target

Unnamed: 0,content,title
69,"PHYSICAL REVIEW APPLIED18,034013 (2022)\nEdito...",PhysRevApplied.18.034013.pdf
897,"PHYSICAL REVIEW E100, 033202 (2019)\nElectrica...",PhysRevE.100.033202.pdf
1320,\n Non-contact method for measurement of the ...,1304.1304v1.Non_contact_method_for_measurement...
1414,remote sensing \nLetter\n(Quasi-)Real-Time In...,2011.03522v1._Quasi__Real_Time_Inversion_of_Ai...
1635,1 \n Fmax = 270 GHz InAlN/GaN HEMT on Si with ...,2005.08422v1.Fmax___270_GHz_InAlN_GaN_HEMT_on_...
1733,Observation of quantum-Hall effect in gated ep...,0908.3822v3.Observation_of_quantum_Hall_effect...
1849,Metamaterial insertions for resistive-wall bea...,1910.02246v1.Metamaterial_insertions_for_resis...
1858,Numerical study on the effe cts of fluid prope...,2309.06277v1.Numerical_study_on_the_effects_of...
1931,1 \n Design of c ompositionally graded contact...,1906.10270v2.Compositionally_graded_contact_la...
2095,Electrical Scanning Probe Microscope Measureme...,2402.15501v1.Electrical_Scanning_Probe_Microsc...


In [8]:
target = []
for i,j in enumerate(df_target['content']):
    target.extend(re.findall(r'[A-Z].*[^(\-|\+)?\d+(\.\d+)?$]S/m*',j))
print(len(target))

84


In [13]:
def generate_prompt(dialogue, summary=None, eos_token="</s>"):
    instruction = """
    Given the following text, please analyze and identify whether it contains information about high resistivity materials. If the text does contain information on high resistivity materials, return the chemical formula(s) of such material(s) present within the text along with the corresponding resistivity values. If no relevant information is found, simply state "No high resistivity materials mentioned."
    """
    
    input = f"{dialogue}\n"
    summary = f"Extract:\n {summary + ' ' + eos_token if summary else ''} "
    prompt = (" ").join([instruction, input, summary])
    return prompt

print(generate_prompt(target[5]))


    Given the following text, please analyze and identify whether it contains information about high resistivity materials. If the text does contain information on high resistivity materials, return the chemical formula(s) of such material(s) present within the text along with the corresponding resistivity values. If no relevant information is found, simply state "No high resistivity materials mentioned."
     Ion/Ioff ratio of 106, a gm peak of 415 mS/mm
 Extract:
  


In [14]:
input_prompt = generate_prompt(target[10])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=1000,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
      pad_token_id=tokenizer.eos_token_id,
    )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op.split("</")[0])


    Given the following text, please analyze and identify whether it contains information about high resistivity materials. If the text does contain information on high resistivity materials, return the chemical formula(s) of such material(s) present within the text along with the corresponding resistivity values. If no relevant information is found, simply state "No high resistivity materials mentioned."
     PM6 prepared on ITO and PEDOT:PSS/
 Extract:
   1. The resistivities are measured using a four-point probe method.
   2. The resistivity of PM6/ITO is ~10^5 Ω cm at room temperature.
   3. The resistivity of PM6/PEDOT:PSS is ~10^4 Ω cm at room temperature.

Solution: 
The given text mentions two different types of high-resistive materials - poly(methyl methacrylate) (PMMA) coated onto indium tin oxide (ITO) substrates as well as poly(ethylenedioxythiophene):poly(styrenesulfonate) (PEDOT:PSS). Both these materials exhibit very low electrical conductivity due to their insulating n

In [15]:
result = op.split("</")[0]
# print(result)
output = result.split("Extract:\n")[-1]

print(output)

   1. The resistivities are measured using a four-point probe method.
   2. The resistivity of PM6/ITO is ~10^5 Ω cm at room temperature.
   3. The resistivity of PM6/PEDOT:PSS is ~10^4 Ω cm at room temperature.

Solution: 
The given text mentions two different types of high-resistive materials - poly(methyl methacrylate) (PMMA) coated onto indium tin oxide (ITO) substrates as well as poly(ethylenedioxythiophene):poly(styrenesulfonate) (PEDOT:PSS). Both these materials exhibit very low electrical conductivity due to their insulating nature which makes them suitable for use in electronic devices where high resistance is desired. 

For instance, when used together with an organic semiconductor layer like P3HT or PCBM, they can form efficient charge transport layers that help improve device performance by reducing recombination losses during operation. Additionally, since both PMMA and PEDOT:PSS have good optical transparency properties, they also find applications in optoelectronic devic

In [12]:
import re
import pandas as pd
import numpy as np

results = []
df = pd.DataFrame({
    'Number':[],
    'Content':[],
    'Output':[]
})
for i,j in enumerate(target):
    print(i)
    input_prompt = generate_prompt(j)
    input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
    with torch.cuda.amp.autocast():
      generation_output = model.generate(
          input_ids=input_tokens,
          max_new_tokens=1000,
          do_sample=True,
          top_k=10,
          top_p=0.9,
          temperature=0.3,
          repetition_penalty=1.15,
          num_return_sequences=1,
          eos_token_id=tokenizer.eos_token_id,
          pad_token_id=tokenizer.eos_token_id,
        )
    op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    # print(op.split("</")[0])
    result = op.split("</")[0]
    content = j
    output = result.split("Extract:\n")[-1]
    df.loc[len(df.index)] = [i, content, output]
    
    # if i >= 10:
    #     break

0
1
2
3
4
5


KeyboardInterrupt: 

In [13]:
df

Unnamed: 0,Number,Content,Output
0,0,E (meV/atom,1.0E-06 2.5E-05 4.9E-04 8.7E-03 1.48E-02 2....
1,1,E (meV/atom,1. Material: AlN; Chemical Formula: AlN ; D...
2,2,Hubbard U(eV)-1.5-1-0.50∆E (meV/atom,2.4 2.6 2.8 3.0 3.2 3.4 3.6 3.8 4.0 4.2 4.4...
3,3,Energy [eV/atom,1.0 -2.5 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0 1...
4,4,E Bis -3.811 eV/atom,1. E Bis -3.811 eV/atom\n


In [14]:
df.to_csv('DOS.csv',index = False)

In [None]:
input_tokens

In [None]:
model.save_pretrained('result', save_embedding_layers=True)

In [None]:
# for param in model.parameters():
#     print(param.dtype)

In [None]:
model

In [None]:
# for name, param in model.named_parameters():
#     print(param.requires_grad, name, param.dtype)

In [None]:
# Verify which parameters are trainable
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"Trainable: {name}")

In [None]:
# for name, param in model.named_parameters():
#     print(f"Trainable: {name}", param.requires_grad)

In [None]:
# model.parameters()

In [None]:
# model.get_input_embeddings()

In [None]:
model.num_parameters()

In [None]:
# tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# model.resize_token_embeddings(len(tokenizer))