In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

In [2]:
!huggingface-cli login --token hf_123

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# Load the base Llama-3 8B checkpoint in bfloat16 with automatic device
# placement (device_map="auto").
model_name = 'meta-llama/Meta-Llama-3-8B'
base_load_kwargs = {
    # "load_in_8bit": True,  # optional 8-bit loading, currently disabled
    "torch_dtype": torch.bfloat16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(model_name, **base_load_kwargs)

# Tokenizer published with the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Reload the base model, then attach the fine-tuned adapter from the Hub.
model_name = "meta-llama/Meta-Llama-3-8B"
adapter_repo = 'ZWG817/Llama3_Chat_Materials'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The tokenizer is taken from the adapter repository; the embedding matrix is
# resized to that tokenizer's length before the adapter is loaded.
tokenizer = AutoTokenizer.from_pretrained(adapter_repo)
model.resize_token_embeddings(len(tokenizer))

model.load_adapter(adapter_repo)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Fetch the abstracts dataset from the Hub and keep its "train" split.
data = load_dataset("ZWG817/Abstract_Template")
data_train = data["train"]
print(data_train)

# Optional evaluation split from a local JSON file (currently disabled):
#custom_data = load_dataset('json', data_files='data_eval.json')
#data_val = custom_data['train']

# Read materials.txt into a list holding one entry per line.
with open('materials.txt') as materials_file:
    word_list = materials_file.read().splitlines()

Dataset({
    features: ['publicationDate', 'title', 'abstract', 'id'],
    num_rows: 8001
})


In [10]:
def generate_prompt(dialogue, summary=None, eos_token="</s>"):
    """Build the summarization prompt for a single text.

    Args:
        dialogue: Text to be summarized; a newline is appended to it.
        summary: Optional reference summary. When truthy, it is placed after
            the "Summary:" header followed by a space and ``eos_token``.
            When falsy, nothing is placed after the header.
        eos_token: Marker appended after ``summary`` (only used when
            ``summary`` is truthy).

    Returns:
        The instruction, the text and the summary section joined with single
        spaces.
    """
    header = "In your role as a helpful scientific assistant, Read the following text and summarize the following. Please make sure the material's molecular formulas are involved:\n"
    body = f"{dialogue}\n"

    # Only a provided summary gets the end-of-sequence marker.
    if summary:
        completion = summary + ' ' + eos_token
    else:
        completion = ''
    tail = f"Summary:\n {completion} "

    return f"{header} {body} {tail}"

# Preview the prompt built for one abstract (row 4); no summary is passed.
print(generate_prompt(data_train[4]['abstract']))

In your role as a helpful scientific assistant, Read the following text and summarize the following. Please make sure the material's molecular formulas are involved:
 Contact resistances between organic semiconductors and metals can dominate
the transport properties of electronic devices incorporating such materials. We
report measurements of the parasitic contact resistance and the true channel
resistance in bottom contact poly(3-hexylthiophene) (P3HT) field-effect
transistors with channel lengths from 400 nm up to 40 $\mu$m, from room
temperature down to 77 K. For fixed gate voltage, the ratio of contact to
channel resistance decreases with decreasing temperature. We compare this
result with a recent model for metal-organic semiconductor contacts. Mobilities
corrected for this contact resistance can approach 1 cm$^{2}$/Vs at room
temperature and high gate voltages.
 Summary:
  


In [12]:
# Build the prompt for one abstract (row 6) and move its token ids to the GPU.
input_prompt = generate_prompt(data_train[6]['abstract'])
encoded_prompt = tokenizer(input_prompt, return_tensors="pt")
input_tokens = encoded_prompt["input_ids"].to("cuda")

# Sampling configuration for this single-example run.
generation_kwargs = dict(
    max_new_tokens=1000,
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.3,
    repetition_penalty=1.15,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # The EOS id is reused as the padding id.
    pad_token_id=tokenizer.eos_token_id,
)

with torch.cuda.amp.autocast():
    generation_output = model.generate(input_ids=input_tokens, **generation_kwargs)

# Decode the first returned sequence and print everything before the first "</".
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op.partition("</")[0])

In your role as a helpful scientific assistant, Read the following text and summarize the following. Please make sure the material's molecular formulas are involved:
 Phase change superlattice is one of the emerging material technologies for
ultralow-power phase change memories. However, the resistance switching
mechanism of phase change superlattice is still hotly debated. Early electrical
measurements and recent materials characterizations have suggested that the
Kooi phase is very likely to be the as-fabricated low-resistance state. Due to
the difficulty in in-situ characterization at atomic resolution, the structure
of the electrically switched superlattice in its high-resistance state is still
unknown and mainly investigated by theoretical modellings. So far, there has
been no simple model that can unify experimental results obtained from
device-level electrical measurements and atomic-level materials
characterizations. In this work, we carry out atomistic transport modellings of


In [17]:
# Keep only the text before the first "</" in the decoded generation.
result = op.partition("</")[0]
# print(result)
content = data_train[6]['abstract']

# The summary is whatever follows the last "Summary:\n" marker.
output = result.rpartition("Summary:\n")[2]
print(output)

     1)   A new resistive switching mechanism is proposed for GeTe-based
phase-change superlattices (PCLs).
   2)   The interfacial phase transition between Kooi and rhombohedral phases
is responsible for the high-resistance state.
   3)   The PCL exhibits an intrinsic bistability with two different
resistances depending on whether or not the interface is in equilibrium.
   4)   The current-induced phase transition occurs via nucleation and growth
processes.
   5)   The critical size of the nucleus decreases exponentially with increasing
temperature.
   6)   The temperature-dependent activation energy barrier is consistent with
experimental observations. 


In [19]:
import re
import pandas as pd
import numpy as np

# Index of the last abstract to process; rows 0..LAST_INDEX (inclusive) are
# summarized.
LAST_INDEX = 10

# Sampling settings do not change between iterations, so build them once.
generation_kwargs = dict(
    max_new_tokens=1000,
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.3,
    repetition_penalty=1.15,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    # The EOS id is reused as the padding id.
    pad_token_id=tokenizer.eos_token_id,
)

# One record per processed abstract. The DataFrame is built once after the
# loop instead of being enlarged row by row, which keeps the column dtypes
# well defined and avoids repeated reallocation.
results = []
for i, row in enumerate(data_train):
    print(i)
    content = row['abstract']
    input_prompt = generate_prompt(content)
    input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
    with torch.cuda.amp.autocast():
        generation_output = model.generate(input_ids=input_tokens, **generation_kwargs)
    op = tokenizer.decode(generation_output[0], skip_special_tokens=True)

    # The decoded sequence contains the prompt as well: cut at the first "</",
    # then keep the text after the last "Summary:\n" marker.
    result = op.split("</")[0]
    output = result.split("Summary:\n")[-1]
    results.append({'Number': i, 'Content': content, 'Output': output})

    if i >= LAST_INDEX:
        break

# Explicit columns keep the frame's schema stable even if no row was processed.
df = pd.DataFrame(results, columns=['Number', 'Content', 'Output'])

0
1
2
3
4
5
6
7
8
9
10


In [20]:
# Display the DataFrame of collected abstracts and generated outputs.
df

Unnamed: 0,Number,Content,Output
0,0,The resistivity and magnetoresistance measurem...,...
1,1,The influence of electrical and thermal contac...,...
2,2,Controlled bipolar resistive switching (BRS) h...,...
3,3,Grain boundaries (GBs) in metals usually incre...,The resistance of copper grain boundaries d...
4,4,Contact resistances between organic semiconduc...,...
5,5,The resistivity of a superconductor in its nor...,1) A novel method is proposed to determine ...
6,6,Phase change superlattice is one of the emergi...,1) A new model is proposed to explain the...
7,7,We study magneto-transport properties in singl...,...
8,8,"Corrosion has a wide impact on society, causin...",1) High entropy alloys (HEAs) have been wid...
9,9,An increase in the quality factor of supercond...,1. A high-quality-factor (Q-value) SR...


In [None]:
# Write the results to 'molecular formula.csv' without the index column.
df.to_csv('molecular formula.csv',index = False)

In [22]:
# Display the current value of input_tokens (the tokenized prompt ids).
input_tokens

tensor([[128000,  45147,  31868,  65562,     60,   1134,  39031,   2511,  73789,
            439,    264,  11190,  18328,    449,    264,   5357,    389,  15374,
             11,   3477,    279,  72758,   1495,    311,   8417,    422,    433,
          15407,    279,  46092,  84614,   6926,    369,    264,   3230,   3769,
             13,  12540,   1778,   2038,    387,   3118,     11,   1160,   1193,
            279,   3769,   6532,  16662,   1202,  20081,  46092,  84614,   6926,
          16134,    524,  39031,  40171,   5783,  22974,    532,    423,   4341,
            278,   9451,  89492,   3893,   5364,  13915,   7911,  16628,    320,
             72,  32989,      8,    304,    958,  10546,   9473,    198,  23045,
          50185,    320,   5484,     34,      8,  52170,     14,   7489,   6535,
             90,    508,  32816,   6251,   6535,     90,   1399,  32816,     33,
           6535,     90,    508,     92,   8693,     16,     13,    717,    198,
          20211,   5738,  99

In [23]:
# Save the model to the local 'result' directory.
# NOTE(review): save_embedding_layers=True presumably includes the (resized)
# embedding layers in the saved files — confirm against the PEFT documentation.
model.save_pretrained('result', save_embedding_layers=True)

In [24]:
# for param in model.parameters():
#     print(param.dtype)

torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bfloat16
torch.bflo

In [25]:
# Display the model's module structure (its repr).
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(134933, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=4

In [26]:
# for name, param in model.named_parameters():
#     print(param.requires_grad, name, param.dtype)

In [27]:
# Verify which parameters are trainable
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"Trainable: {name}")

In [28]:
# for name, param in model.named_parameters():
#     print(f"Trainable: {name}", param.requires_grad)

In [29]:
# model.parameters()

In [30]:
# model.get_input_embeddings()

In [31]:
# Report the model's parameter count.
model.num_parameters()

In [32]:
# tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# model.resize_token_embeddings(len(tokenizer))