In [1]:
import torch
import transformers
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from peft import (
        get_peft_model, 
        prepare_model_for_kbit_training, 
        LoraConfig
    )
from trl import SFTTrainer

In [2]:
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                            load_in_8bit=True,
                                            device_map="auto"
                                            )
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [3]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (

In [3]:
from huggingface_hub import snapshot_download

snapshot_download(repo_id="enyuan/llama_2_7b_materials", cache_dir="cp")

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

'cp/models--enyuan--llama_2_7b_materials/snapshots/429016e567bf6f618e2edd8f2c09ce68240fe57b'

In [4]:
from peft import PeftModel

tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

peft_model_id = "cp/models--enyuan--llama_2_7b_materials/snapshots/429016e567bf6f618e2edd8f2c09ce68240fe57b"
peft_model = PeftModel.from_pretrained(model, peft_model_id, torch_dtype=torch.float16, offload_folder="lora_results/lora_7/temp")

In [7]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the value:\n"
    input = f"Gilbert damping constant of {prompt}\n"
    output = f"Value: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('SiO_2')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op) 

Answer the value:
 Gilbert damping constant of SiO_2
 Value: 0.016


In [9]:
op[len(input_prompt):]

'0.016'

In [16]:
import pandas as pd

def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the value:\n"
    input = f"Gilbert damping constant of {prompt}\n"
    output = f"Value: "
    prompt = (" ").join([instruction, input, output])
    return prompt

def predict_value(material):
    input_prompt = generate_prompt(material)
    input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
    with torch.cuda.amp.autocast():
        generation_output = peft_model.generate(
            input_ids=input_tokens,
            max_new_tokens=100,
            do_sample=True,
            top_k=10,
            top_p=0.9,
            temperature=0.3,
            repetition_penalty=1.15,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
          )
    op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    #print(op)
    return op[len(input_prompt):]

df = pd.read_csv('predictions.csv')
generated_texts = []

for i in df['Materials']:
    generated_texts.append(predict_value(i))

df['Damping constant'] = generated_texts
df.to_csv('result2.csv', index=False)

In [13]:
df['Damping constant'] = generated_texts

In [15]:
df.to_csv('result.csv', index=False)

In [43]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
"""
predefined_tokens = [
    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc",
    "V", "Cr", "Fe", "Co", "Ge", "As", "Se", "Br", "Y", "Mo",
    "Ag", "In", "Te", "I", "La", "Pr", "Sm", "Er", "Lu", "W",
    "Re", "Os", "Bi", "Po", "At", "Fr", "Ac", "Th", "Pa", "U",
    "Am", "Es", "No", "Db", "Fl", "Mc"
]
"""
predefined_tokens = [
    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc",
    "V", "Cr", "Fe", "Co", "Ge", "As", "Se", "Br", "Y", "Mo",
    "Ag", "In", "Te", "I", "La", "Pr", "Sm", "Er", "Lu", "W",
    "Re", "Os", "Bi", "Po", "At", "Fr", "Ac", "Th", "Pa", "U",
    "Am", "Es", "No", "Db", "Fl", "Mc", 
    "A", "D", "E", "F", "G", "J", "L", "M", "Q", "R", "T", "X", "Z"
]

# 将预定义的 token 列表转换为对应的 token IDs
token_ids = tokenizer.convert_tokens_to_ids(predefined_tokens)

# 生成下一个 token 的概率分布
with torch.no_grad():
    input_ids = torch.tensor([token_ids])
    logits = peft_model(input_ids).logits

# 打印每个元素及其对应的概率
for element, token_id in zip(predefined_tokens, token_ids):
    token_index = input_ids[0].tolist().index(token_id)
    probability = torch.softmax(logits[0, token_index], dim=-1)[token_id].item()
    print(f"Probability of {element}: {probability:.4f}")

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
Probability of H: 0.0000
Probability of He: 0.0024
Probability of Li: 0.0055
Probability of Be: 0.0007
Probability of B: 0.0002
Probability of C: 0.0000
Probability of N: 0.0003
Probability of O: 0.0236
Probability of F: 0.0000
Probability of Ne: 0.0095
Probability of Na: 0.0004
Probability of Al: 0.0000
Probability of Si: 0.0281
Probability of P: 0.0002
Probability of S: 0.0000
Probability of Cl: 0.0000
Probability of Ar: 0.0000
Probability of K: 0.0034
Probability of Ca: 0.0019
Probability of Sc: 0.0000
Probability of V: 0.0614
Probability of Cr: 0.0000
Probability of Fe: 0.0000
Probability of Co: 0.0001
Probability of Ge: 0.0012
Probability of As: 0.0003
Probability of Se: 0.0002
Probability of Br: 0.0002
Probability of Y: 0.0002
Probability of Mo: 0.0001
Probability of Ag: 0.0003
Probability of In: 0.0018
Probability of Te: 0.0001
Probability of I: 0.0000
Probability of La: 

In [16]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: Y_3"
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = peft_model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 10
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)

# Convert probabilities to a human-readable format (e.g., Python list)
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(top_token_ids, top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: Y_3
Word: Fe, Probability: 0.2266
Word: Al, Probability: 0.2079
Word: N, Probability: 0.0605
Word: (, Probability: 0.0568
Word: B, Probability: 0.0490
Word: Co, Probability: 0.0436
Word: C, Probability: 0.0295
Word: Cr, Probability: 0.0206
Word: M, Probability: 0.0198
Word: Bi, Probability: 0.0177


In [24]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant in low temperture.')

def next_step(elements, results):
    # 存储新的列表和特殊字符的元素列表
    new_elements = []
    
    # 特殊字符列表
    special_chars = {'.', '</s>', ' ', 'The', '\n'}
    
    for element in elements:
        # 添加一个空格，然后编码为输入张量
        input_text = input_prompt + element
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
    
        # 生成下一个字符的概率分布
        with torch.no_grad():
            logits = peft_model(input_ids).logits
    
        # 计算下一个字符的概率
        next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)
    
        # 寻找概率大于0.1且概率排在前三的字符
        top_chars = torch.topk(next_char_prob, k=2)
        for char_id, prob in zip(top_chars.indices, top_chars.values):
            if prob > 0.3:
                next_char = tokenizer.decode([char_id])
                if next_char not in special_chars:
                    new_element = element + next_char
                    new_elements.append(new_element)
                else:
                    results.append(element)
    
    # 打印新的列表
    print("New Elements:", new_elements)
    
    # 打印特殊字符对应的元素列表
    print("Results:", results)
    return new_elements, results

results = []
elements = [
    'Li_2O', 'Li_3N', 'LiNi_', 'LiNbO', 'SiO_2', 'Si_2O', 'Si_12', 'Si_3O', 'Si_3N', 'SiC_2', 'SiC_x', 'SiC_8', 'K_2C', 'K_3C', 
    'K_3Fe', 'KC_2', 'KC_8', 'KC_4', 'Ca_2M', 'Ca_2Co', 'Ca_3Co', 'Ca_12', 'Ca_10', 'V_2O', 'V_4O', 'V_4Co', 'V_12', 'V_10', 
    'In_2O', 'In_3S', 'In_4S', 'WC_1', 'WO_1', 'WO_3', 'WO_2', 'W_2O', 'W_2C', 'W_12', 'W_10', 'W_3C', 'Fe_2O', 'Fe_2Si', 
    'Fe_3O', 'Fe_3C', 'Fe_12', 'Fe_10', 'Re_2O', 'Re_2Co', 'Re_3B', 'At_2O', 'At_2X', 'At_12', 'Ac_2O', 'U_2O', 'U_3P', 'U_12', 
    'U_10', 'Zn_2Cr', 'Zn_12', 'Zr_2O', 'Zr_6C', 'Zr_6O', 'Y_3'
]
results = []

for i in range(10):
    elements, results = next_step(elements, results)

New Elements: ['Li_2O_', 'Li_3N_', 'LiNbO_', 'Si_2O_', 'Si_3O_', 'Si_3N_', 'K_2Cu', 'K_3C_', 'KC_8H', 'KC_4H', 'Ca_2Mg', 'Ca_2Mn', 'Ca_3Co_', 'V_2O_', 'V_4O_', 'V_4Co_', 'V_100', 'In_2O_', 'In_3Sn', 'In_3Sb', 'In_4Sn', 'In_4Sb', 'WC_1-', 'WO_12', 'W_2O_', 'W_2C_', 'W_100', 'W_3C_', 'Fe_2O_', 'Fe_3O_', 'Fe_12N', 'Fe_100', 'Re_2O_', 'Re_2Co_', 'Re_3B_', 'At_2O_', 'At_2Xe', 'U_2O_', 'U_3P_', 'Zr_2O_', 'Zr_6Cu', 'Zr_6C_', 'Zr_6O_', 'Y_3Al']
Results: ['WO_3', 'Ac_2O', 'Ac_2O']
New Elements: ['Li_2O_2', 'Li_3N_2', 'LiNbO_3', 'Si_2O_3', 'Si_3O_1', 'Si_3N_4', 'K_2CuCl', 'K_3C_6', 'KC_8H_', 'KC_4H_', 'Ca_2Mg_', 'Ca_3Co_2', 'V_2O_5', 'V_2O_3', 'V_4O_1', 'In_2O_3', 'In_3SnS', 'In_3Sb_', 'In_4Sn_', 'In_4Sb_', 'WC_1-x', 'W_2C_1', 'W_3C_1', 'Fe_2O_3', 'Fe_3O_4', 'Fe_12N_', 'Re_2O_7', 'Re_2Co_1', 'At_2O_3', 'At_2Xe_', 'U_2O_7', 'U_3P_4', 'Zr_2O_3', 'Zr_6Cu_', 'Zr_6C_2', 'Zr_6C_1', 'Zr_6O_1', 'Y_3Al_']
Results: ['WO_3', 'Ac_2O', 'Ac_2O']
New Elements: ['Li_3N_2O', 'Si_3O_10', 'K_2CuCl_', 'K_3C_60', 'K

In [27]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant in low temperture.')

def next_step(elements, results):
    # 存储新的列表和特殊字符的元素列表
    new_elements = []
    
    # 特殊字符列表
    special_chars = {'.', '</s>', ' ', 'The', '\n', '(', '['}
    
    for element in elements:
        # 添加一个空格，然后编码为输入张量
        input_text = input_prompt + element
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
    
        # 生成下一个字符的概率分布
        with torch.no_grad():
            logits = peft_model(input_ids).logits
    
        # 计算下一个字符的概率
        next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)
    
        # 寻找概率大于0.1且概率排在前三的字符
        top_chars = torch.topk(next_char_prob, k=2)
        for char_id, prob in zip(top_chars.indices, top_chars.values):
            if prob > 0.2:
                next_char = tokenizer.decode([char_id])
                if next_char not in special_chars:
                    new_element = element + next_char
                    new_elements.append(new_element)
                else:
                    results.append(element)
    
    # 打印新的列表
    print("New Elements:", new_elements)
    
    # 打印特殊字符对应的元素列表
    print("Results:", results)
    return new_elements, results

results = []
elements = [
    'Li_', 'LiN', 'SiO', 'Si_', 'SiC', 'K_', 'KC', 'Ca_', 'V_', 'VO', 'In_', 'WC', 'WO', 'W_', 'Fe_', 'Re_', 'At_', 'Y_3', 'U_', 'Zn_', 'Zr_'
]
results = []

for i in range(15):
    elements, results = next_step(elements, results)

New Elements: ['Li_2', 'Li_3', 'LiNi', 'SiO_', 'Si_1', 'Si_2', 'SiC_', 'K_2', 'K_3', 'KC_', 'KCu', 'Ca_2', 'Ca_3', 'V_2', 'In_2', 'WC_', 'WO_', 'W_2', 'W_1', 'Fe_3', 'Fe_2', 'Re_2', 'At_2', 'Y_3Al', 'Y_3Fe', 'U_2', 'Zn_2', 'Zr_2']
Results: ['VO']
New Elements: ['LiNi_', 'SiO_2', 'SiO_4', 'Si_12', 'Si_2O', 'SiC_x', 'K_3C', 'KC_2', 'Ca_3Co', 'V_2O', 'In_2O', 'WC_1', 'WO_3', 'WO_1', 'W_12', 'Fe_3O', 'At_2O', 'Y_3Al_', 'Y_3Fe_']
Results: ['VO', 'KCu', 'KCu']
New Elements: ['LiNi_0', 'LiNi_2', 'Si_12B', 'Si_12O', 'Si_2O_', 'K_3C_', 'KC_2F', 'Ca_3Co_', 'V_2O_', 'In_2O_', 'WC_1-', 'WO_12', 'W_12C', 'Fe_3O_', 'At_2O_', 'Y_3Al_5', 'Y_3Fe_5']
Results: ['VO', 'KCu', 'KCu', 'SiO_2', 'SiC_x', 'WO_3']
New Elements: ['LiNi_2Co', 'Si_12B_', 'Si_12O_', 'Si_2O_3', 'Si_2O_5', 'K_3C_6', 'KC_2F_', 'Ca_3Co_2', 'V_2O_5', 'V_2O_3', 'In_2O_3', 'WC_1-x', 'W_12C_', 'Fe_3O_4', 'At_2O_3', 'Y_3Al_5O', 'Y_3Fe_5O']
Results: ['VO', 'KCu', 'KCu', 'SiO_2', 'SiC_x', 'WO_3', 'LiNi_0', 'LiNi_2', 'WO_12']
New Elements: ['Li

In [8]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

elements = [
    "Li", "Si", "K", "Ca", "V", "In", "W", "Fe", "Re", "At", "Ac", "Th", "U", "No", "Zn", "Zr"
]

# 存储新的列表和特殊字符的元素列表
new_elements = []
results = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n'}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=3)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
New Elements: ['Li_', 'LiN', 'SiO', 'Si_', 'SiC', 'K_', 'KC', 'Ca_', 'V_', 'VO', 'V(', 'In_', 'WC', 'WO', 'W_', 'Fe_', 'Re_', 'Atleast', 'Atom', 'At_', 'Acet', 'Ac_', 'U(', 'UO', 'U_', 'Noinformation', 'Nodata', 'Zn_', 'Zn(', 'Zr_', 'Zr(']
Results: []


In [9]:
elements = [
    'Li_', 'LiN', 'SiO', 'Si_', 'SiC', 'K_', 'KC', 'Ca_', 'V_', 'VO', 'In_', 'WC', 'WO', 'W_', 'Fe_', 'Re_', 'At_', 'Ac_', 'U_', 'Zn_', 'Zr_'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []
results = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n'}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=3)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['Li_2', 'Li_3', 'LiNi', 'LiNb', 'LiN(', 'SiO_', 'Si_2', 'Si_1', 'Si_3', 'SiC_', 'K_2', 'K_3', 'KC_', 'KCu', 'Ca_2', 'Ca_3', 'Ca_1', 'V_2', 'V_4', 'V_1', 'VO(', 'In_2', 'In_3', 'In_4', 'WC_', 'WO_', 'W_2', 'W_1', 'W_3', 'Fe_2', 'Fe_3', 'Fe_1', 'Re_2', 'Re_3', 'At_2', 'At_1', 'Ac_2', 'U_2', 'U_3', 'U_1', 'Zn_2', 'Zn_1', 'Zr_2', 'Zr_6']
Results: ['SiO', 'SiC']


In [10]:
elements = [
    'Li_2', 'Li_3', 'LiNi', 'LiNb', 'LiN(', 'SiO_', 'Si_2', 'Si_1', 'Si_3', 'SiC_', 'K_2', 'K_3', 'KC_', 'KCu', 'Ca_2', 'Ca_3', 'Ca_1', 'V_2', 'V_4', 'V_1', 'VO(', 'In_2', 'In_3', 'In_4', 'WC_', 'WO_', 'W_2', 'W_1', 'W_3', 'Fe_2', 'Fe_3', 'Fe_1', 'Re_2', 'Re_3', 'At_2', 'At_1', 'Ac_2', 'U_2', 'U_3', 'U_1', 'Zn_2', 'Zn_1', 'Zr_2', 'Zr_6'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n'}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=3)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['Li_2O', 'Li_3N', 'LiNi_', 'LiNbO', 'LiN(CH', 'LiN(C', 'LiN(Si', 'SiO_2', 'Si_2O', 'Si_12', 'Si_3O', 'Si_3N', 'SiC_2', 'SiC_x', 'SiC_8', 'K_2C', 'K_3C', 'K_3[', 'K_3Fe', 'KC_2', 'KC_8', 'KC_4', 'KCu(', 'KCu[', 'Ca_2M', 'Ca_2Co', 'Ca_3(', 'Ca_3Co', 'Ca_12', 'Ca_10', 'V_2O', 'V_4O', 'V_4Co', 'V_12', 'V_10', 'VO(CO', 'VO(C', 'VO(OH', 'In_2O', 'In_3S', 'In_4(', 'In_4S', 'WC_1', 'WO_1', 'WO_3', 'WO_2', 'W_2O', 'W_2C', 'W_12', 'W_10', 'W_3C', 'Fe_2O', 'Fe_2(', 'Fe_2Si', 'Fe_3O', 'Fe_3C', 'Fe_12', 'Fe_10', 'Re_2O', 'Re_2Co', 'Re_3B', 'At_2O', 'At_2X', 'At_12', 'Ac_2O', 'U_2O', 'U_2(', 'U_3P', 'U_12', 'U_10', 'Zn_2Cr', 'Zn_12', 'Zr_2O', 'Zr_2(', 'Zr_6C', 'Zr_6O']
Results: ['SiO', 'SiC', 'Ca_2', 'Ca_1', 'V_1', 'At_1', 'Zn_2', 'Zn_1']


In [11]:
elements = [
    'Li_2O', 'Li_3N', 'LiNi_', 'LiNbO', 'SiO_2', 'Si_2O', 'Si_12', 'Si_3O', 'Si_3N', 'SiC_2', 'SiC_x', 'SiC_8', 'K_2C', 'K_3C', 
    'K_3Fe', 'KC_2', 'KC_8', 'KC_4', 'Ca_2M', 'Ca_2Co', 'Ca_3Co', 'Ca_12', 'Ca_10', 'V_2O', 'V_4O', 'V_4Co', 'V_12', 'V_10', 
    'In_2O', 'In_3S', 'In_4S', 'WC_1', 'WO_1', 'WO_3', 'WO_2', 'W_2O', 'W_2C', 'W_12', 'W_10', 'W_3C', 'Fe_2O', 'Fe_2Si', 
    'Fe_3O', 'Fe_3C', 'Fe_12', 'Fe_10', 'Re_2O', 'Re_2Co', 'Re_3B', 'At_2O', 'At_2X', 'At_12', 'Ac_2O', 'U_2O', 'U_3P', 'U_12', 
    'U_10', 'Zn_2Cr', 'Zn_12', 'Zr_2O', 'Zr_6C', 'Zr_6O'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n', '(', '['}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=3)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['Li_2O_', 'Li_2O-', 'Li_3N_', 'Li_3Ni', 'LiNi_0', 'LiNi_2', 'LiNi_1', 'LiNbO_', 'Si_2O_', 'Si_12O', 'Si_12B', 'Si_12C', 'Si_3O_', 'Si_3N_', 'SiC_2H', 'SiC_2O', 'SiC_xO', 'SiC_xH', 'SiC_8H', 'K_2Cu', 'K_2C_', 'K_3C_', 'K_3Cu', 'K_3FeCl', 'K_3Fe_', 'KC_2H', 'KC_2O', 'KC_2F', 'KC_8H', 'KC_4H', 'Ca_2Mg', 'Ca_2Mn', 'Ca_2CoAl', 'Ca_2Co_', 'Ca_2CoM', 'Ca_3Co_', 'Ca_10Al', 'V_2O_', 'V_4O_', 'V_4Co_', 'V_12O', 'V_12C', 'V_100', 'V_10O', 'In_2O_', 'In_3Sb', 'In_3Sn', 'In_4Sn', 'In_4Sb', 'In_4S_', 'WC_1-', 'WC_1−', 'WO_12', 'WO_10', 'W_2O_', 'W_2C_', 'W_12C', 'W_12O', 'W_100', 'W_3C_', 'Fe_2O_', 'Fe_2SiO', 'Fe_2Si_', 'Fe_3O_', 'Fe_3C_', 'Fe_12N', 'Fe_12C', 'Fe_100', 'Re_2O_', 'Re_2Co_', 'Re_3B_', 'At_2O_', 'At_2Xe', 'At_2X_', 'At_12B', 'At_12C', 'U_2O_', 'U_3P_', 'U_12C', 'Zn_2CrO', 'Zn_2CrS', 'Zn_2Cr_', 'Zn_12C', 'Zr_2O_', 'Zr_6Cu', 'Zr_6C_', 'Zr_6Cd', 'Zr_6O_']
Results: ['SiO', 'SiC', 'Ca_2', 'Ca_1', 'V_1', 'At_1', 'Zn_2', 'Zn_1', 'Li_2O', 'SiO_2', 'SiC_2', 'SiC_x', 'SiC_8', 'K_3

In [12]:
elements = [
    'Li_2O_', 'Li_2O-', 'Li_3N_', 'Li_3Ni', 'LiNi_0', 'LiNi_2', 'LiNi_1', 'LiNbO_', 'Si_2O_', 'Si_12O', 'Si_12B', 'Si_12C', 'Si_3O_', 'Si_3N_', 'SiC_2H', 'SiC_2O', 'SiC_xO', 'SiC_xH', 'SiC_8H', 'K_2Cu', 'K_2C_', 'K_3C_', 'K_3Cu', 'K_3FeCl', 'K_3Fe_', 'KC_2H', 'KC_2O', 'KC_2F', 'KC_8H', 'KC_4H', 'Ca_2Mg', 'Ca_2Mn', 'Ca_2CoAl', 'Ca_2Co_', 'Ca_2CoM', 'Ca_3Co_', 'Ca_10Al', 'V_2O_', 'V_4O_', 'V_4Co_', 'V_12O', 'V_12C', 'V_100', 'V_10O', 'In_2O_', 'In_3Sb', 'In_3Sn', 'In_4Sn', 'In_4Sb', 'In_4S_', 'WC_1-', 'WC_1−', 'WO_12', 'WO_10', 'W_2O_', 'W_2C_', 'W_12C', 'W_12O', 'W_100', 'W_3C_', 'Fe_2O_', 'Fe_2SiO', 'Fe_2Si_', 'Fe_3O_', 'Fe_3C_', 'Fe_12N', 'Fe_12C', 'Fe_100', 'Re_2O_', 'Re_2Co_', 'Re_3B_', 'At_2O_', 'At_2Xe', 'At_2X_', 'At_12B', 'At_12C', 'U_2O_', 'U_3P_', 'U_12C', 'Zn_2CrO', 'Zn_2CrS', 'Zn_2Cr_', 'Zn_12C', 'Zr_2O_', 'Zr_6Cu', 'Zr_6C_', 'Zr_6Cd', 'Zr_6O_'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n', '(', '['}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=3)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['Li_2O_2', 'Li_2O_3', 'Li_2O-N', 'Li_3N_2', 'Li_3N_4', 'Li_3N_3', 'Li_3Ni_', 'LiNi_2Co', 'LiNi_2M', 'LiNi_1-', 'LiNi_1−', 'LiNbO_3', 'Si_2O_3', 'Si_2O_5', 'Si_12O_', 'Si_12B_', 'Si_12C_', 'Si_3O_1', 'Si_3N_4', 'SiC_2H_', 'SiC_2O_', 'SiC_xO_', 'SiC_xH_', 'SiC_8H_', 'K_2CuCl', 'K_2C_2', 'K_2C_6', 'K_2C_4', 'K_3C_6', 'K_3C_2', 'K_3CuS', 'K_3FeCl_', 'K_3Fe_2', 'K_3Fe_5', 'K_3Fe_4', 'KC_2H_', 'KC_2O_', 'KC_2F_', 'KC_8H_', 'KC_4H_', 'Ca_2Mg_', 'Ca_2Mn_', 'Ca_2MnSi', 'Ca_2CoAl_', 'Ca_2Co_1', 'Ca_2Co_2', 'Ca_2Co_3', 'Ca_2CoMn', 'Ca_2CoMg', 'Ca_3Co_2', 'Ca_3Co_4', 'Ca_3Co_1', 'Ca_10Al_', 'V_2O_5', 'V_2O_3', 'V_4O_1', 'V_4O_9', 'V_4Co_1', 'V_4Co_2', 'V_4Co_3', 'V_12O_', 'V_12C_', 'V_10O_', 'In_2O_3', 'In_3Sb_', 'In_3SnS', 'In_3Sn_', 'In_4Sn_', 'In_4SnS', 'In_4Sb_', 'In_4S_3', 'In_4S_4', 'In_4S_8', 'WC_1-x', 'WC_1−x', 'WO_12O', 'WO_10O', 'WO_10^', 'W_2O_1', 'W_2O_7', 'W_2O_8', 'W_2C_1', 'W_2C_x', 'W_12C_', 'W_12O_', 'W_3C_1', 'W_3C_2', 'Fe_2O_3', 'Fe_2O_4', 'Fe_2SiO_', 'Fe_2Si_1', 

In [13]:
elements = [
    'Li_2O_2', 'Li_2O_3', 'Li_3N_2', 'Li_3N_4', 'Li_3N_3', 'Li_3Ni_', 'LiNi_2Co', 'LiNi_2M', 'LiNbO_3', 'Si_2O_3', 'Si_2O_5', 'Si_12O_', 
    'Si_12B_', 'Si_12C_', 'Si_3O_1', 'Si_3N_4', 'SiC_2H_', 'SiC_2O_', 'SiC_xO_', 'SiC_xH_', 'SiC_8H_', 'K_2CuCl', 'K_2C_2', 'K_2C_6', 
    'K_2C_4', 'K_3C_6', 'K_3C_2', 'K_3CuS', 'K_3FeCl_', 'K_3Fe_2', 'K_3Fe_5', 'K_3Fe_4', 'KC_2H_', 'KC_2O_', 'KC_2F_', 'KC_8H_', 'KC_4H_', 
    'Ca_2Mg_', 'Ca_2Mn_', 'Ca_2MnSi', 'Ca_2CoAl_', 'Ca_2Co_1', 'Ca_2Co_2', 'Ca_2Co_3', 'Ca_2CoMn', 'Ca_2CoMg', 'Ca_3Co_2', 'Ca_3Co_4', 
    'Ca_3Co_1', 'Ca_10Al_', 'V_2O_5', 'V_2O_3', 'V_4O_1', 'V_4O_9', 'V_4Co_1', 'V_4Co_2', 'V_4Co_3', 'V_12O_', 'V_12C_', 'V_10O_', 'In_2O_3', 
    'In_3Sb_', 'In_3SnS', 'In_3Sn_', 'In_4Sn_', 'In_4SnS', 'In_4Sb_', 'In_4S_3', 'In_4S_4', 'In_4S_8', 'WC_1-x', 'WC_1−x',  
    'W_2O_1', 'W_2O_7', 'W_2O_8', 'W_2C_1', 'W_2C_x', 'W_12C_', 'W_12O_', 'W_3C_1', 'W_3C_2', 'Fe_2O_3', 'Fe_2O_4', 'Fe_2SiO_', 
    'Fe_2Si_1', 'Fe_2Si_3', 'Fe_2Si_2', 'Fe_3O_4', 'Fe_3C_2', 'Fe_12N_', 'Fe_12C_', 'Re_2O_7', 'Re_2O_5', 'Re_2Co_1', 
    'Re_2Co_2', 'Re_3B_2', 'Re_3B_1', 'Re_3B_3', 'At_2O_3', 'At_2Xe_', 'At_2X_3', 'At_2X_2', 'At_2X_1', 'At_12B_', 'At_12C_', 'U_2O_7', 'U_2O_3', 
    'U_3P_2', 'U_3P_4', 'U_3P_5', 'U_12C_', 'Zn_2CrO_', 'Zn_2CrSb', 'Zn_2CrSn', 'Zn_2Cr_0', 'Zn_2Cr_1', 'Zn_12Cu', 'Zn_12Cd', 'Zr_2O_3', 'Zr_2O_2', 
    'Zr_2O_4', 'Zr_6Cu_', 'Zr_6C_2', 'Zr_6C_1', 'Zr_6Cd_', 'Zr_6O_1'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n', '(', '[', '-', '−'}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=3)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['Li_3N_2O', 'Li_3N_4O', 'Li_3N_3O', 'Li_3Ni_2', 'Li_3Ni_4', 'Li_3Ni_1', 'LiNi_2CoM', 'LiNi_2Co_', 'LiNi_2Mn', 'LiNi_2Mg', 'Si_12O_1', 'Si_12B_1', 'Si_12B_6', 'Si_12C_6', 'Si_12C_1', 'Si_3O_10', 'SiC_2H_2', 'SiC_2H_5', 'SiC_2H_3', 'SiC_2O_2', 'SiC_2O_3', 'SiC_2O_4', 'SiC_xO_y', 'SiC_xH_2', 'SiC_xH_y', 'SiC_8H_1', 'SiC_8H_6', 'SiC_8H_8', 'K_2CuCl_', 'K_2C_2O', 'K_2C_2N', 'K_2C_60', 'K_2C_4H', 'K_2C_4N', 'K_2C_4O', 'K_3C_60', 'K_3C_2N', 'K_3C_2O', 'K_3CuSb', 'K_3CuSn', 'K_3FeCl_6', 'K_3FeCl_1', 'K_3FeCl_9', 'K_3Fe_2O', 'K_3Fe_5O', 'K_3Fe_4O', 'KC_2H_5', 'KC_2H_4', 'KC_2H_3', 'KC_2O_4', 'KC_2O_3', 'KC_2F_5', 'KC_2F_4', 'KC_8H_1', 'KC_8H_8', 'KC_4H_4', 'KC_4H_8', 'KC_4H_1', 'Ca_2Mg_2', 'Ca_2Mg_1', 'Ca_2Mg_3', 'Ca_2Mn_3', 'Ca_2Mn_2', 'Ca_2Mn_1', 'Ca_2MnSi_', 'Ca_2MnSiO', 'Ca_2CoAl_1', 'Ca_2CoAl_2', 'Ca_2CoAl_4', 'Ca_2Co_12', 'Ca_2Co_2O', 'Ca_2Co_2M', 'Ca_2Co_2Al', 'Ca_2Co_3O', 'Ca_2CoMnSi', 'Ca_2CoMnO', 'Ca_2CoMn_', 'Ca_2CoMgSi', 'Ca_2CoMgZ', 'Ca_2CoMg_', 'Ca_3Co_2O', 'Ca_3Co_

In [14]:
elements = [
    'Li_3N_2O', 'Li_3N_4O', 'Li_3N_3O', 'Li_3Ni_2', 'Li_3Ni_4', 'Li_3Ni_1', 'LiNi_2CoM', 'LiNi_2Co_', 'LiNi_2Mn', 'LiNi_2Mg', 'Si_12O_1', 'Si_12B_1', 'Si_12B_6', 'Si_12C_6', 'Si_12C_1', 'Si_3O_10', 'SiC_2H_2', 'SiC_2H_5', 'SiC_2H_3', 'SiC_2O_2', 'SiC_2O_3', 'SiC_2O_4', 'SiC_xO_y', 'SiC_xH_2', 'SiC_xH_y', 'SiC_8H_1', 'SiC_8H_6', 'SiC_8H_8', 'K_2CuCl_', 'K_2C_2O', 'K_2C_2N', 'K_2C_60', 'K_2C_4H', 'K_2C_4N', 'K_2C_4O', 'K_3C_60', 'K_3C_2N', 'K_3C_2O', 'K_3CuSb', 'K_3CuSn', 'K_3FeCl_6', 'K_3FeCl_1', 'K_3FeCl_9', 'K_3Fe_2O', 'K_3Fe_5O', 'K_3Fe_4O', 'KC_2H_5', 'KC_2H_4', 'KC_2H_3', 'KC_2O_4', 'KC_2O_3', 'KC_2F_5', 'KC_2F_4', 'KC_8H_1', 'KC_8H_8', 'KC_4H_4', 'KC_4H_8', 'KC_4H_1', 'Ca_2Mg_2', 'Ca_2Mg_1', 'Ca_2Mg_3', 'Ca_2Mn_3', 'Ca_2Mn_2', 'Ca_2Mn_1', 'Ca_2MnSi_', 'Ca_2MnSiO', 'Ca_2CoAl_1', 'Ca_2CoAl_2', 'Ca_2CoAl_4', 'Ca_2Co_12', 'Ca_2Co_2O', 'Ca_2Co_2M', 'Ca_2Co_2Al', 'Ca_2Co_3O', 'Ca_2CoMnSi', 'Ca_2CoMnO', 'Ca_2CoMn_', 'Ca_2CoMgSi', 'Ca_2CoMgZ', 'Ca_2CoMg_', 'Ca_3Co_2O', 'Ca_3Co_4O', 'Ca_3Co_12', 'Ca_10Al_1', 'Ca_10Al_9', 'Ca_10Al_2', 'V_4O_12', 'V_4O_10', 'V_4O_11', 'V_4Co_12', 'V_4Co_16', 'V_4Co_2Cr', 'V_4Co_2M', 'V_12O_1', 'V_12O_2', 'V_12C_1', 'V_12C_2', 'V_10O_1', 'V_10O_2', 'In_3Sb_2', 'In_3Sb_5', 'In_3Sb_4', 'In_3SnS_', 'In_3Sn_2', 'In_3Sn_4', 'In_3Sn_3', 'In_4Sn_4', 'In_4Sn_1', 'In_4Sn_3', 'In_4SnS_', 'In_4SnSb', 'In_4Sb_1', 'In_4Sb_4', 'In_4Sb_2', 'In_4S_3O', 'In_4S_4O', 'WC_1-xN', 'WC_1−xN', 'W_2O_12', 'W_2O_10', 'W_2O_7S', 'W_2O_8S', 'W_2C_18', 'W_2C_12', 'W_2C_xN', 'W_12C_1', 'W_12C_2', 'W_12O_4', 'W_12O_1', 'W_12O_2', 'W_3C_12', 'W_3C_18', 'W_3C_2N', 'Fe_2SiO_4', 'Fe_2Si_3O', 'Fe_2Si_3B', 'Fe_2Si_2O', 'Fe_3O_4@', 'Fe_3C_2O', 'Fe_3C_2T', 'Fe_12N_2', 'Fe_12N_1', 'Fe_12C_6', 'Fe_12C_1', 'Fe_12C_2', 'Re_2Co_17', 'Re_2Co_12', 'Re_2Co_10', 'Re_2Co_2Si', 'Re_2Co_2Al', 'Re_3B_12', 'Re_3B_10', 'At_2Xe_1', 'At_2Xe_3', 'At_2Xe_2', 'At_2X_3Y', 'At_2X_2Y', 'At_2X_12', 'At_2X_10', 'At_12B_1', 'At_12B_6', 'At_12C_1', 'At_12C_2', 'U_2O_3N', 'U_3P_2O', 'U_3P_4O', 'U_3P_5O', 'U_3P_5S', 'U_12C_1', 'U_12C_6', 'U_12C_2', 'Zn_2CrO_4', 'Zn_2CrSb_', 'Zn_2CrSbS', 'Zn_2CrSnS', 'Zn_12Cu_', 'Zn_12Cd_', 'Zr_2O_2Si', 'Zr_2O_2N', 'Zr_6Cu_3', 'Zr_6Cu_1', 'Zr_6Cu_2', 'Zr_6C_2H', 'Zr_6C_2O', 'Zr_6C_2N', 'Zr_6C_12', 'Zr_6Cd_1', 'Zr_6Cd_2', 'Zr_6Cd_3', 'Zr_6O_12', 'Zr_6O_13', 'Zr_6O_11'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n', '(', '[', '-', '−'}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=3)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['Li_3N_2O_', 'Li_3N_4O_', 'Li_3N_3O_', 'Li_3Ni_2M', 'Li_3Ni_2Co', 'Li_3Ni_4Co', 'LiNi_2CoMn', 'LiNi_2Co_0', 'LiNi_2Co_1', 'LiNi_2Co_2', 'LiNi_2Mn_', 'LiNi_2Mg_', 'Si_12O_19', 'Si_12O_16', 'Si_12O_18', 'Si_12B_12', 'Si_12B_18', 'Si_12B_19', 'Si_12B_6O', 'Si_12C_6H', 'Si_12C_18', 'Si_12C_12', 'SiC_2H_2O', 'SiC_2H_2N', 'SiC_2H_5O', 'SiC_2H_3O', 'SiC_2H_3N', 'SiC_2O_2H', 'SiC_xO_yH', 'SiC_xH_yO', 'SiC_8H_10', 'SiC_8H_16', 'SiC_8H_18', 'SiC_8H_6O', 'SiC_8H_6N', 'SiC_8H_8O', 'SiC_8H_8N', 'K_2CuCl_4', 'K_2C_2O_', 'K_2C_2N_', 'K_2C_60N', 'K_2C_4H_', 'K_2C_4N_', 'K_2C_4O_', 'K_3C_60N', 'K_3C_2N_', 'K_3C_2O_', 'K_3CuSbO', 'K_3CuSb_', 'K_3CuSnO', 'K_3CuSnF', 'K_3CuSnBr', 'K_3FeCl_6Br', 'K_3FeCl_6N', 'K_3FeCl_12', 'K_3FeCl_10', 'K_3FeCl_9N', 'K_3FeCl_9O', 'K_3Fe_2O_', 'K_3Fe_5O_', 'K_3Fe_4O_', 'KC_2H_5N', 'KC_2H_4N', 'KC_2H_3N', 'KC_2H_3O', 'KC_2O_3N', 'KC_2F_5N', 'KC_2F_5O', 'KC_2F_4N', 'KC_2F_4O', 'KC_8H_10', 'KC_8H_12', 'KC_8H_11', 'KC_8H_8N', 'KC_8H_8O', 'KC_4H_4N', 'KC_4H_4O', 

In [15]:
elements = [
    'Li_3N_2O_', 'Li_3N_4O_', 'Li_3N_3O_', 'Li_3Ni_2M', 'Li_3Ni_2Co', 'Li_3Ni_4Co', 'LiNi_2CoMn', 'LiNi_2Co_0', 'LiNi_2Co_1', 'LiNi_2Co_2', 'LiNi_2Mn_', 
    'LiNi_2Mg_', 'Si_12O_19', 'Si_12O_16', 'Si_12O_18', 'Si_12B_12', 'Si_12B_18', 'Si_12B_19', 'Si_12B_6O', 'Si_12C_6H', 'Si_12C_18', 'Si_12C_12', 
    'SiC_2H_2O', 'SiC_2H_2N', 'SiC_2H_5O', 'SiC_2H_3O', 'SiC_2H_3N', 'SiC_2O_2H', 'SiC_xO_yH', 'SiC_xH_yO', 'SiC_8H_10', 'SiC_8H_16', 'SiC_8H_18', 
    'SiC_8H_6O', 'SiC_8H_6N', 'SiC_8H_8O', 'SiC_8H_8N', 'K_2CuCl_4', 'K_2C_2O_', 'K_2C_2N_', 'K_2C_60N', 'K_2C_4H_', 'K_2C_4N_', 'K_2C_4O_', 'K_3C_60N', 
    'K_3C_2N_', 'K_3C_2O_', 'K_3CuSbO', 'K_3CuSb_', 'K_3CuSnO', 'K_3CuSnF', 'K_3CuSnBr', 'K_3FeCl_6Br', 'K_3FeCl_6N', 'K_3FeCl_12', 'K_3FeCl_10', 
    'K_3FeCl_9N', 'K_3FeCl_9O', 'K_3Fe_2O_', 'K_3Fe_5O_', 'K_3Fe_4O_', 'KC_2H_5N', 'KC_2H_4N', 'KC_2H_3N', 'KC_2H_3O', 'KC_2O_3N', 'KC_2F_5N', 'KC_2F_5O', 
    'KC_2F_4N', 'KC_2F_4O', 'KC_8H_10', 'KC_8H_12', 'KC_8H_11', 'KC_8H_8N', 'KC_8H_8O', 'KC_4H_4N', 'KC_4H_4O', 'KC_4H_8N', 'KC_4H_8O', 'KC_4H_10', 
    'KC_4H_12', 'KC_4H_11', 'Ca_2Mg_2Si', 'Ca_2Mg_2Al', 'Ca_2Mg_3Si', 'Ca_2Mg_3Al', 'Ca_2Mn_3O', 'Ca_2Mn_2O', 'Ca_2MnSi_2', 'Ca_2MnSiO_', 'Ca_2CoAl_12', 
    'Ca_2CoAl_10', 'Ca_2CoAl_2O', 'Ca_2CoAl_2Si', 'Ca_2CoAl_4O', 'Ca_2Co_12O', 'Ca_2Co_12Al', 'Ca_2Co_2O_', 'Ca_2Co_2Mn', 'Ca_2Co_2Mg', 'Ca_2Co_2Al_', 
    'Ca_2Co_3O_', 'Ca_2CoMnSi_', 'Ca_2CoMnSiO', 'Ca_2CoMnO_', 'Ca_2CoMn_2', 'Ca_2CoMn_3', 'Ca_2CoMn_4', 'Ca_2CoMgSi_', 'Ca_2CoMgSiO', 'Ca_2CoMgZn', 
    'Ca_2CoMg_2', 'Ca_2CoMg_3', 'Ca_2CoMg_4', 'Ca_3Co_2O_', 'Ca_3Co_4O_', 'Ca_3Co_12O', 'Ca_10Al_10', 'Ca_10Al_12', 'Ca_10Al_9M', 'Ca_10Al_9Si', 
    'Ca_10Al_2Si', 'Ca_10Al_2O', 'Ca_10Al_20', 'V_4Co_12Cr', 'V_4Co_12Fe', 'V_4Co_16Cr', 'V_4Co_2Cr_', 'V_4Co_2Mn', 'V_12O_19', 'V_12O_18', 'V_12O_28', 
    'V_12C_18', 'V_12C_12', 'V_12C_2N', 'V_10O_16', 'V_10O_18', 'V_10O_17', 'V_10O_28', 'V_10O_20', 'V_10O_22', 'In_3Sb_2Te', 'In_3Sb_2O', 'In_3Sb_5Te', 
    'In_3Sb_5Se', 'In_3Sb_4O', 'In_3Sb_4Se', 'In_3Sb_4Te', 'In_3SnS_4', 'In_3Sn_2S', 'In_3Sn_2O', 'In_3Sn_4S', 'In_3Sn_4O', 'In_3Sn_3S', 'In_3Sn_3O', 
    'In_3Sn_3As', 'In_4Sn_4S', 'In_4Sn_4O', 'In_4Sn_12', 'In_4Sn_3S', 'In_4Sn_3O', 'In_4SnS_4', 'In_4SnSb_', 'In_4Sb_12', 'In_4Sb_4O', 'In_4Sb_4Te', 
    'In_4Sb_4Se', 'In_4Sb_2Te', 'In_4Sb_2O', 'In_4S_3O_', 'In_4S_4O_', 'WC_1-xNb', 'WC_1-xN_', 'WC_1−xNb', 'WC_1−xN_', 'W_2O_7S_', 'W_2O_8S_', 'W_2C_xN_', 
    'W_12C_18', 'W_12C_24', 'W_12C_20', 'W_12C_22', 'W_12O_40', 'W_12O_19', 'W_12O_18', 'W_12O_28', 'W_12O_20', 'W_12O_22', 'W_3C_12N', 'W_3C_12B', 
    'W_3C_18H', 'W_3C_18N', 'W_3C_2N_', 'Fe_2Si_3O_', 'Fe_2Si_3B_', 'Fe_2Si_2O_', 'Fe_3C_2O_', 'Fe_3C_2T_', 'Fe_3C_2Ti', 
    'Fe_12N_2O', 'Fe_12N_12', 'Fe_12N_18', 'Fe_12C_6N', 'Fe_12C_6S', 'Fe_12C_18', 'Fe_12C_12', 'Fe_12C_2N', 'Fe_12C_2Si', 'Re_2Co_17B', 'Re_2Co_17Al', 
    'Re_2Co_12B', 'Re_2Co_12Al', 'Re_2Co_10B', 'Re_2Co_10Al', 'Re_2Co_2Si_', 'Re_2Co_2Al_', 'Re_3B_12C', 'Re_3B_10Si', 'At_2Xe_12', 'At_2Xe_13', 
    'At_2Xe_10', 'At_2Xe_3Y', 'At_2Xe_2Y', 'At_2X_3Y_', 'At_2X_2Y_', 'At_2X_2YZ', 'At_2X_12Y', 'At_2X_100', 'At_2X_10Y', 'At_12B_12', 'At_12B_18', 
    'At_12B_60', 'At_12B_6O', 'At_12C_18', 'At_12C_12', 'At_12C_24', 'At_12C_20', 'U_2O_3N_', 'U_2O_3NH', 'U_3P_2O_', 'U_3P_4O_', 'U_3P_5O_', 'U_3P_5S_', 
    'U_3P_5Sb', 'U_12C_12', 'U_12C_18', 'U_12C_6N', 'U_12C_24', 'U_12C_22', 'Zn_2CrSb_8', 'Zn_2CrSb_1', 'Zn_2CrSb_2', 'Zn_2CrSbS_', 'Zn_2CrSnS_', 'Zn_12Cu_2', 
    'Zn_12Cu_3', 'Zn_12Cu_1', 'Zn_12Cd_1', 'Zn_12Cd_3', 'Zn_12Cd_2', 'Zr_2O_2SiO', 'Zr_2O_2Si_', 'Zr_2O_2N_', 'Zr_6Cu_3Si', 'Zr_6Cu_12', 'Zr_6Cu_2Si', 
    'Zr_6Cu_2N', 'Zr_6C_2H_', 'Zr_6C_2O_', 'Zr_6C_2N_', 'Zr_6C_12H', 'Zr_6C_12O', 'Zr_6C_12N', 'Zr_6Cd_12', 'Zr_6Cd_2N', 'Zr_6Cd_3N', 'Zr_6O_12N', 
    'Zr_6O_13N', 'Zr_6O_11N', 'Zr_6O_11Cl'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n', '(', '[', '-', '−'}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=3)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['Li_3N_2O_4', 'Li_3N_2O_2', 'Li_3N_2O_3', 'Li_3N_4O_2', 'Li_3N_4O_3', 'Li_3N_4O_4', 'Li_3N_3O_1', 'Li_3N_3O_2', 'Li_3N_3O_4', 'Li_3Ni_2Mn', 'Li_3Ni_2Mg', 'Li_3Ni_2Co_', 'Li_3Ni_2CoM', 'Li_3Ni_4Co_', 'LiNi_2CoMn_', 'LiNi_2Co_2M', 'LiNi_2Co_2O', 'LiNi_2Mn_1', 'LiNi_2Mn_0', 'LiNi_2Mn_2', 'LiNi_2Mg_0', 'LiNi_2Mg_1', 'Si_12O_16N', 'Si_12B_12O', 'Si_12B_12H', 'Si_12B_18H', 'Si_12B_19H', 'Si_12B_19O', 'Si_12B_19N', 'Si_12B_6O_', 'Si_12C_6H_', 'Si_12C_18H', 'Si_12C_12O', 'Si_12C_12H', 'Si_12C_12N', 'SiC_2H_2O_', 'SiC_2H_2N_', 'SiC_2H_2NH', 'SiC_2H_5O_', 'SiC_2H_5OCH', 'SiC_2H_3O_', 'SiC_2H_3N_', 'SiC_2O_2H_', 'SiC_xO_yH_', 'SiC_xH_yO_', 'SiC_xH_yOz', 'SiC_8H_10O', 'SiC_8H_10N', 'SiC_8H_16N', 'SiC_8H_16O', 'SiC_8H_18N', 'SiC_8H_18O', 'SiC_8H_6O_', 'SiC_8H_6N_', 'SiC_8H_8O_', 'SiC_8H_8N_', 'K_2CuCl_4·', 'K_2C_2O_6', 'K_2C_2O_7', 'K_2C_2O_4', 'K_2C_2N_2', 'K_2C_2N_4', 'K_2C_2N_6', 'K_2C_60N_', 'K_2C_4H_2', 'K_2C_4H_3', 'K_2C_4H_4', 'K_2C_4N_2', 'K_2C_4N_4', 'K_2C_4N_6', 'K_2C_4O_1'

In [17]:
elements = [
    'Li_3N_2O_4', 'Li_3N_2O_2', 'Li_3N_2O_3', 'Li_3N_4O_2', 'Li_3N_4O_3', 'Li_3N_4O_4', 'Li_3N_3O_1', 'Li_3N_3O_2', 'Li_3N_3O_4', 'Li_3Ni_2Mn', 'Li_3Ni_2Mg', 'Li_3Ni_2Co_', 'Li_3Ni_2CoM', 'Li_3Ni_4Co_', 'LiNi_2CoMn_', 'LiNi_2Co_2M', 'LiNi_2Co_2O', 'LiNi_2Mn_1', 'LiNi_2Mn_0', 'LiNi_2Mn_2', 'LiNi_2Mg_0', 'LiNi_2Mg_1', 'Si_12O_16N', 'Si_12B_12O', 'Si_12B_12H', 'Si_12B_18H', 'Si_12B_19H', 'Si_12B_19O', 'Si_12B_19N', 'Si_12B_6O_', 'Si_12C_6H_', 'Si_12C_18H', 'Si_12C_12O', 'Si_12C_12H', 'Si_12C_12N', 'SiC_2H_2O_', 'SiC_2H_2N_', 'SiC_2H_2NH', 'SiC_2H_5O_', 'SiC_2H_5OCH', 'SiC_2H_3O_', 'SiC_2H_3N_', 'SiC_2O_2H_', 'SiC_xO_yH_', 'SiC_xH_yO_', 'SiC_xH_yOz', 'SiC_8H_10O', 'SiC_8H_10N', 'SiC_8H_16N', 'SiC_8H_16O', 'SiC_8H_18N', 'SiC_8H_18O', 'SiC_8H_6O_', 'SiC_8H_6N_', 'SiC_8H_8O_', 'SiC_8H_8N_', 'K_2CuCl_4·', 'K_2C_2O_6', 'K_2C_2O_7', 'K_2C_2O_4', 'K_2C_2N_2', 'K_2C_2N_4', 'K_2C_2N_6', 'K_2C_60N_', 'K_2C_4H_2', 'K_2C_4H_3', 'K_2C_4H_4', 'K_2C_4N_2', 'K_2C_4N_4', 'K_2C_4N_6', 'K_2C_4O_1', 'K_2C_4O_9', 'K_2C_4O_6', 'K_3C_60N_', 'K_3C_2N_4', 'K_3C_2N_2', 'K_3C_2N_6', 'K_3C_2O_6', 'K_3C_2O_4', 'K_3C_2O_7', 'K_3CuSbO_', 'K_3CuSb_2', 'K_3CuSb_4', 'K_3CuSb_3', 'K_3CuSnO_', 'K_3CuSnF_', 'K_3CuSnBr_', 'K_3FeCl_6Br_', 'K_3FeCl_6N_', 'K_3FeCl_12Br', 'K_3FeCl_12F', 'K_3FeCl_10O', 'K_3FeCl_10F', 'K_3FeCl_9N_', 'K_3FeCl_9O_', 'K_3Fe_2O_4', 'K_3Fe_5O_1', 'K_3Fe_4O_9', 'K_3Fe_4O_1', 'KC_2H_5NH', 'KC_2H_5N_', 'KC_2H_4N_', 'KC_2H_3N_', 'KC_2H_3O_', 'KC_2O_3N_', 'KC_2F_5N_', 'KC_2F_5O_', 'KC_2F_5OCl', 'KC_2F_4N_', 'KC_2F_4O_', 'KC_8H_10N', 'KC_8H_10O', 'KC_8H_12N', 'KC_8H_12O', 'KC_8H_11N', 'KC_8H_8N_', 'KC_8H_8O_', 'KC_4H_4N_', 'KC_4H_4O_', 'KC_4H_8N_', 'KC_4H_8O_', 'KC_4H_10N', 'KC_4H_10O', 'KC_4H_12N', 'KC_4H_12O', 'KC_4H_11N', 'Ca_2Mg_2Si_', 'Ca_2Mg_2Al_', 'Ca_2Mg_3Si_', 'Ca_2Mg_3Al_', 'Ca_2Mn_3O_', 'Ca_2Mn_2O_', 'Ca_2MnSi_2O', 'Ca_2MnSiO_5', 'Ca_2MnSiO_6', 'Ca_2MnSiO_4', 'Ca_2CoAl_12O', 'Ca_2CoAl_10O', 'Ca_2CoAl_2O_', 'Ca_2CoAl_2Si_', 'Ca_2CoAl_4O_', 'Ca_2Co_12O_', 'Ca_2Co_12Al_', 'Ca_2Co_2O_6', 'Ca_2Co_2O_4', 'Ca_2Co_2O_5', 'Ca_2Co_2Mn_', 'Ca_2Co_2Mg_', 'Ca_2Co_2Al_1', 'Ca_2Co_2Al_2', 'Ca_2Co_2Al_4', 'Ca_2Co_3O_6', 'Ca_2Co_3O_1', 'Ca_2CoMnSi_2', 'Ca_2CoMnSi_4', 'Ca_2CoMnSi_3', 'Ca_2CoMnSiO_', 'Ca_2CoMnO_6', 'Ca_2CoMnO_4', 'Ca_2CoMnO_5', 'Ca_2CoMn_2O', 'Ca_2CoMn_3O', 'Ca_2CoMn_4O', 'Ca_2CoMgSi_2', 'Ca_2CoMgSi_4', 'Ca_2CoMgSiO_', 'Ca_2CoMgZnT', 'Ca_2CoMg_2Al', 'Ca_2CoMg_2Si', 'Ca_2CoMg_3Si', 'Ca_2CoMg_3Al', 'Ca_2CoMg_4Al', 'Ca_3Co_2O_6', 'Ca_3Co_4O_1', 'Ca_3Co_4O_9', 'Ca_3Co_12O_', 'Ca_10Al_10T', 'Ca_10Al_10Fe', 'Ca_10Al_12O', 'Ca_10Al_9Mg', 'Ca_10Al_9Mn', 'Ca_10Al_9Si_', 'Ca_10Al_2Si_', 'Ca_10Al_2O_', 'Ca_10Al_20M', 'Ca_10Al_20T', 'Ca_10Al_20Si', 'V_4Co_12Cr_', 'V_4Co_12Fe_', 'V_4Co_16Cr_', 'V_4Co_2Cr_2', 'V_4Co_2Cr_1', 'V_4Co_2Cr_3', 'V_4Co_2Mn_', 'V_4Co_2MnS', 'V_12C_18N', 'V_12C_18H', 'V_12C_12N', 'V_12C_12O', 'V_12C_2N_', 'In_3Sb_2Te_', 'In_3Sb_2O_', 'In_3Sb_5Te_', 'In_3Sb_5Se_', 'In_3Sb_4O_', 'In_3Sb_4Se_', 'In_3Sb_4Te_', 'In_3Sn_2S_', 'In_3Sn_2Sb', 'In_3Sn_2O_', 'In_3Sn_4S_', 'In_3Sn_4Sb', 'In_3Sn_4O_', 'In_3Sn_3S_', 'In_3Sn_3Sb', 'In_3Sn_3O_', 'In_3Sn_3As_', 'In_4Sn_4S_', 'In_4Sn_4Sb', 'In_4Sn_4O_', 'In_4Sn_12S', 'In_4Sn_3Sb', 'In_4Sn_3S_', 'In_4Sn_3O_', 'In_4SnSb_4', 'In_4SnSb_2', 'In_4SnSb_1', 'In_4Sb_12Se', 'In_4Sb_12Te', 'In_4Sb_4O_', 'In_4Sb_4Te_', 'In_4Sb_4Se_', 'In_4Sb_2Te_', 'In_4Sb_2O_', 'In_4S_3O_1', 'In_4S_4O_1', 'In_4S_4O_6', 'In_4S_4O_2', 'WC_1-xNb_', 'WC_1-xN_x', 'WC_1−xNb_', 'WC_1−xN_x', 'W_2O_7S_2', 'W_2O_7S_3', 'W_2O_7S_4', 'W_2O_8S_2', 'W_2O_8S_4', 'W_2C_xN_2', 'W_2C_xN_1', 'W_2C_xN_4', 'W_12C_18N', 'W_12C_18H', 'W_12C_24N', 'W_12C_24H', 'W_12C_20N', 'W_12C_20H', 'W_12C_22N', 'W_12C_22S', 'W_3C_12N_', 'W_3C_12B_', 'W_3C_12BN', 'W_3C_18H_', 'W_3C_18N_', 'W_3C_2N_2', 'W_3C_2N_4', 'W_3C_2N_3', 'Fe_2Si_3O_1', 'Fe_2Si_3O_6', 'Fe_2Si_3O_8', 'Fe_2Si_3B_2', 'Fe_2Si_3B_1', 'Fe_2Si_2O_5', 'Fe_2Si_2O_6', 'Fe_2Si_2O_4', 'Fe_3C_2O_4', 'Fe_3C_2T_2', 'Fe_3C_2T_4', 'Fe_3C_2T_x', 'Fe_3C_2Ti_', 'Fe_12N_2O_', 'Fe_12N_12O', 'Fe_12N_18O', 'Fe_12N_18Co', 'Fe_12C_6N_', 'Fe_12C_6S_', 'Fe_12C_6Sb', 'Fe_12C_18N', 'Fe_12C_12N', 'Fe_12C_12O', 'Fe_12C_2N_', 'Fe_12C_2Si_', 'Re_2Co_17B_', 'Re_2Co_17Al_', 'Re_2Co_12B_', 'Re_2Co_12Al_', 'Re_2Co_10B_', 'Re_2Co_10Al_', 'Re_2Co_2Si_2', 'Re_2Co_2Si_1', 'Re_2Co_2Al_1', 'Re_2Co_2Al_2', 'Re_3B_12C_', 'Re_3B_10Si_', 'At_2Xe_12B', 'At_2Xe_12Y', 'At_2Xe_13B', 'At_2Xe_13Y', 'At_2Xe_10B', 'At_2Xe_10Y', 'At_2Xe_3Y_', 'At_2Xe_2Y_', 'At_2Xe_2YZ', 'At_2X_3Y_3', 'At_2X_3Y_4', 'At_2X_3Y_2', 'At_2X_2Y_2', 'At_2X_2Y_3', 'At_2X_2Y_4', 'At_2X_2YZ_', 'At_2X_12Y_', 'At_2X_100Y', 'At_2X_10Y_', 'At_12B_12C', 'At_12B_12H', 'At_12B_18H', 'At_12B_60Si', 'At_12B_60H', 'At_12B_6O_', 'At_12C_18H', 'At_12C_18O', 'At_12C_18N', 'At_12C_12O', 'At_12C_12N', 'At_12C_12H', 'At_12C_24H', 'At_12C_24N', 'At_12C_20H', 'At_12C_20N', 'U_2O_3N_2', 'U_2O_3N_4', 'U_2O_3NH_', 'U_3P_2O_1', 'U_3P_4O_1', 'U_3P_5O_1', 'U_3P_5O_2', 'U_3P_5S_1', 'U_3P_5S_2', 'U_3P_5Sb_', 'U_12C_12N', 'U_12C_12O', 'U_12C_18N', 'U_12C_6N_', 'U_12C_24N', 'U_12C_22N', 'Zn_2CrSb_8S', 'Zn_2CrSb_2S', 'Zn_2CrSbS_4', 'Zn_2CrSbS_6', 'Zn_2CrSnS_4', 'Zn_2CrSnS_6', 'Zn_2CrSnS_8', 'Zn_12Cu_2M', 'Zn_12Cu_2N', 'Zn_12Cu_3N', 'Zn_12Cu_3M', 'Zn_12Cu_12', 'Zn_12Cd_12', 'Zn_12Cd_1S', 'Zn_12Cd_3Co', 'Zn_12Cd_3N', 'Zn_12Cd_3S', 'Zn_12Cd_2N', 'Zn_12Cd_2M', 'Zn_12Cd_2S', 'Zr_2O_2SiO_', 'Zr_2O_2Si_2', 'Zr_2O_2Si_3', 'Zr_2O_2Si_1', 'Zr_2O_2N_2', 'Zr_2O_2N_4', 'Zr_6Cu_3Si_', 'Zr_6Cu_12N', 'Zr_6Cu_12Si', 'Zr_6Cu_2Si_', 'Zr_6Cu_2Ni', 'Zr_6C_2H_1', 'Zr_6C_2H_2', 'Zr_6C_2O_1', 'Zr_6C_2O_2', 'Zr_6C_2N_2', 'Zr_6C_2N_4', 'Zr_6C_2N_6', 'Zr_6C_12H_', 'Zr_6C_12O_', 'Zr_6C_12N_', 'Zr_6Cd_12N', 'Zr_6Cd_2Ni', 'Zr_6Cd_3Ni', 'Zr_6O_12N_', 'Zr_6O_13N_', 'Zr_6O_11N_', 'Zr_6O_11Cl_'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n', '(', '[', '-', '−'}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=2)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.2:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['Li_3N_4O_20', 'Li_3N_3O_12', 'Li_3N_3O_20', 'Li_3Ni_2Mn_', 'Li_3Ni_2Mg_', 'Li_3Ni_2Co_2', 'Li_3Ni_2CoMn', 'Li_3Ni_4Co_2', 'Li_3Ni_4Co_4', 'LiNi_2CoMn_2', 'LiNi_2CoMn_3', 'LiNi_2Co_2Mn', 'LiNi_2Co_2O_', 'LiNi_2Mn_2O', 'Si_12O_16N_', 'Si_12B_12O_', 'Si_12B_12H_', 'Si_12B_18H_', 'Si_12B_19H_', 'Si_12B_19O_', 'Si_12B_19N_', 'Si_12B_6O_1', 'Si_12B_6O_2', 'Si_12C_6H_1', 'Si_12C_18H_', 'Si_12C_12O_', 'Si_12C_12H_', 'Si_12C_12N_', 'SiC_2H_2N_2', 'SiC_2H_2NH_', 'SiC_2H_5O_9', 'SiC_2H_5OCH_', 'SiC_2H_3O_1', 'SiC_2H_3O_9', 'SiC_2H_3N_2', 'SiC_2O_2H_2', 'SiC_xO_yH_z', 'SiC_xH_yO_z', 'SiC_xH_yO_1', 'SiC_8H_10O_', 'SiC_8H_10N_', 'SiC_8H_16N_', 'SiC_8H_16O_', 'SiC_8H_18N_', 'SiC_8H_18O_', 'SiC_8H_6O_3', 'SiC_8H_6N_2', 'SiC_8H_8O_3', 'SiC_8H_8O_2', 'SiC_8H_8N_2', 'SiC_8H_8N_4', 'K_2C_2O_6N', 'K_2C_2O_4N', 'K_2C_2N_2O', 'K_2C_2N_4O', 'K_2C_2N_6O', 'K_2C_60N_2', 'K_2C_60N_4', 'K_2C_4H_2N', 'K_2C_4H_3N', 'K_2C_4H_3O', 'K_2C_4H_4N', 'K_2C_4H_4O', 'K_2C_4N_2O', 'K_2C_4N_4O', 'K_2C_4N_6O', '

In [20]:
elements = [
    'LiNi_2Co_2Mn_', 'LiNi_2Co_2O_1', 'LiNi_2Mn_2O_', 'Si_12B_12O_3', 'Si_12B_12H_1', 'Si_12B_6O_19', 'Si_12B_6O_22', 'Si_12C_12O_1', 'Si_12C_12H_1', 'SiC_2H_2N_2O', 'SiC_2H_2NH_2', 'SiC_2H_5OCH_3', 'SiC_2H_3N_2O', 'SiC_2O_2H_2O', 'SiC_8H_10N_2', 'SiC_8H_6O_3N', 'SiC_8H_6N_2O', 'SiC_8H_8O_3N', 'SiC_8H_8N_2O', 'SiC_8H_8N_4O', 'K_2C_2O_6N_', 'K_2C_2O_4N_', 'K_2C_2N_2O_', 'K_2C_2N_4O_', 'K_2C_2N_6O_', 'K_2C_4H_2N_', 'K_2C_4H_3N_', 'K_2C_4H_3O_', 'K_2C_4H_4N_', 'K_2C_4H_4O_', 'K_2C_4N_2O_', 'K_2C_4N_4O_', 'K_2C_4N_6O_', 'K_3C_60N_12', 'K_3C_2N_4O_', 'K_3C_2N_2O_', 'K_3C_2N_6O_', 'K_3C_2O_6N_', 'K_3C_2O_4N_', 'K_3CuSbO_12', 'K_3CuSb_2O_', 'K_3CuSb_4O_', 'K_3CuSb_3O_', 'K_3CuSnO_12', 'K_3CuSnBr_12', 'K_3FeCl_6N_6O', 'K_3FeCl_6N_2O', 'K_3Fe_2O_4N_', 'KC_2H_5NH_3', 'KC_2H_5N_2O', 'KC_2H_5N_3O', 'KC_2H_4N_2O', 'KC_2H_3N_2O', 'KC_2F_4N_2O', 'KC_8H_10N_2', 'KC_8H_12N_2', 'KC_8H_11N_2', 'KC_8H_8N_2O', 'KC_4H_4N_2O', 'KC_4H_8N_2O', 'KC_4H_8N_4O', 'KC_4H_10N_2', 'KC_4H_12N_2', 'KC_4H_11N_2', 'Ca_2Mg_2Si_2O', 'Ca_2Mn_3O_12', 'Ca_2MnSi_2O_1', 'Ca_2MnSiO_5O_', 'Ca_2CoAl_12O_2', 'Ca_2CoAl_10O_2', 'Ca_2CoAl_2Si_2O', 'Ca_2Co_2Al_12O', 'Ca_2Co_2Al_2O_', 'Ca_2Co_2Al_4O_', 'Ca_2CoMnSi_2O_', 'Ca_2CoMnSi_4O_', 'Ca_2CoMnSi_3O_', 'Ca_2CoMn_2O_1', 'Ca_2CoMn_3O_1', 'Ca_2CoMn_4O_1', 'Ca_2CoMgSi_2O_', 'Ca_2CoMgSi_4O_', 'Ca_10Al_10Ti_', 'Ca_10Al_12O_2', 'Ca_10Al_2O_19', 'Ca_10Al_20Mg_', 'Ca_10Al_20Mn_', 'Ca_10Al_20Ti_', 'V_4Co_2MnSn_', 'In_3Sb_2Te_12', 'In_3Sb_2O_12', 'In_3Sb_5Te_12', 'In_3Sb_4Te_12', 'In_4Sn_12Sb_', 'In_4Sn_3Sb_1', 'In_4Sb_12Se_2', 'In_4Sb_12Te_1', 'WC_1-xNb_xO', 'WC_1-xN_xO_', 'WC_1−xNb_xO', 'WC_1−xN_xO_', 'W_3C_12N_12', 'W_3C_12B_12', 'W_3C_2N_2O_', 'W_3C_2N_4O_', 'W_3C_2N_3O_', 'Fe_2Si_3B_2O_', 'Fe_2Si_2O_5N_', 'Fe_2Si_2O_4N_', 'Fe_12N_2O_19', 'Fe_12N_12O_4', 'Re_2Co_2Si_2O_', 'Re_2Co_2Al_2O_', 'Re_3B_12C_12', 'At_2Xe_12B_1', 'At_2Xe_12Yb_', 'At_2Xe_13Yb_', 'At_2Xe_10Yb_', 'At_2X_3Y_4Z_', 'At_2X_3Y_2Z_', 'At_2X_2Y_2Z_', 'At_2X_2Y_3Z_', 'At_2X_2Y_4Z_', 'At_2X_100Yb_', 'At_12B_12C_1', 'At_12B_12H_1', 'At_12C_12H_1', 'U_2O_3N_2O_', 'U_12C_12N_1', 'Zn_2CrSb_2Sn_', 'Zn_12Cu_2Mn_', 'Zn_12Cu_2Ni_', 'Zn_12Cu_3Ni_', 'Zn_12Cu_3Mn_', 'Zn_12Cd_1Sn_', 'Zn_12Cd_1Sb_', 'Zn_12Cd_3Ni_', 'Zn_12Cd_3Sn_', 'Zn_12Cd_3Sb_', 'Zn_12Cd_2Ni_', 'Zn_12Cd_2Mn_', 'Zn_12Cd_2Sn_', 'Zn_12Cd_2Sb_', 'Zr_2O_2Si_2O_', 'Zr_2O_2Si_3O_', 'Zr_2O_2N_2O_', 'Zr_2O_2N_4O_', 'Zr_6Cu_3Si_6O', 'Zr_6Cu_12Ni_', 'Zr_6Cu_12Si_1', 'Zr_6C_2H_2N_', 'Zr_6C_2H_2O_', 'Zr_6C_2N_2O_', 'Zr_6C_2N_4O_', 'Zr_6C_2N_6O_', 'Zr_6C_12N_6O', 'Zr_6C_12N_12', 'Zr_6Cd_12Ni_'
]

# 存储新的列表和特殊字符的元素列表
new_elements = []
results = []

# 特殊字符列表
special_chars = {'.', '</s>', ' ', 'The', '\n', '(', '[', '-', '−'}

for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to("cuda")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1且概率排在前三的字符
    top_chars = torch.topk(next_char_prob, k=2)
    for char_id, prob in zip(top_chars.indices, top_chars.values):
        if prob > 0.5:
            next_char = tokenizer.decode([char_id])
            if next_char not in special_chars:
                new_element = element + next_char
                new_elements.append(new_element)
            else:
                results.append(element)

# 打印新的列表
print("New Elements:", new_elements)

# 打印特殊字符对应的元素列表
print("Results:", results)

New Elements: ['LiNi_2Co_2O_12', 'LiNi_2Mn_2O_1', 'Si_12B_12O_36', 'Si_12B_12H_12', 'Si_12C_12O_12', 'Si_12C_12H_12', 'SiC_2H_3N_2O_', 'SiC_8H_10N_2O', 'SiC_8H_6O_3N_', 'SiC_8H_6N_2O_', 'SiC_8H_8O_3N_', 'SiC_8H_8N_2O_', 'SiC_8H_8N_4O_', 'K_2C_2N_2O_1', 'K_2C_2N_4O_1', 'K_2C_2N_6O_1', 'K_2C_4N_2O_1', 'K_2C_4N_4O_1', 'K_2C_4N_6O_1', 'K_3C_2N_4O_1', 'K_3C_2N_2O_1', 'K_3C_2N_6O_1', 'K_3CuSb_2O_1', 'K_3CuSb_4O_1', 'K_3CuSb_3O_1', 'K_3FeCl_6N_6O_', 'K_3FeCl_6N_2O_', 'KC_2H_5N_2O_', 'KC_2H_5N_3O_', 'KC_2H_4N_2O_', 'KC_2H_3N_2O_', 'KC_2F_4N_2O_', 'KC_8H_10N_2O', 'KC_8H_12N_2O', 'KC_8H_11N_2O', 'KC_8H_8N_2O_', 'KC_4H_4N_2O_', 'KC_4H_8N_2O_', 'KC_4H_8N_4O_', 'KC_4H_10N_2O', 'KC_4H_12N_2O', 'KC_4H_11N_2O', 'Ca_2Mg_2Si_2O_', 'Ca_2CoAl_12O_22', 'Ca_2CoAl_2Si_2O_', 'Ca_2Co_2Al_12O_', 'Ca_2Co_2Al_2O_1', 'Ca_2Co_2Al_4O_1', 'Ca_2CoMnSi_2O_1', 'Ca_2CoMnSi_4O_1', 'Ca_2CoMnSi_3O_1', 'Ca_2CoMn_2O_12', 'Ca_2CoMn_3O_12', 'Ca_2CoMgSi_2O_1', 'Ca_2CoMgSi_4O_1', 'Ca_10Al_12O_22', 'In_4Sn_3Sb_12', 'In_4Sb_12Se_22

In [53]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

elements = [
    "He", "Li", "O", "Si", "K", "Ca", "V", "In", "W", "Re", "At", "Ac", "Th", "U", "No", "Zn", "Zr"
]

# 存储新的列表
new_elements = []

# 遍历每个元素
for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1的字符并添加到新列表
    for char_id, prob in enumerate(next_char_prob):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            new_element = element + next_char
            new_elements.append(new_element)

# 打印新的列表
print(new_elements)

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
['He_', 'Li_', 'LiN', 'OCS', 'OC', 'O_', 'SiC', 'Si_', 'SiO', 'KC', 'K_', 'Ca_', 'V(', 'V_', 'VO', 'In_', 'WC', 'W_', 'WO', 'Re_', 'Atom', 'Atleast', 'At_', 'Acet', 'Ac_', 'U(', 'U_', 'UO', 'Nodata', 'Noinformation', 'Zn(', 'Zn_', 'Zr(', 'Zr_']


In [54]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

elements = [
    'He_', 'Li_', 'LiN', 'OCS', 'OC', 'O_', 'SiC', 'Si_', 'SiO', 'KC', 'K_', 'Ca_', 'V(', 'V_', 'VO', 'In_', 'WC', 'W_', 'WO', 'Re_', 'At_', 'Ac_', 'U(', 'U_', 'UO', 'Zn(', 'Zn_', 'Zr(', 'Zr_'
]

# 存储新的列表
new_elements = []

# 遍历每个元素
for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1的字符并添加到新列表
    for char_id, prob in enumerate(next_char_prob):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            new_element = element + next_char
            new_elements.append(new_element)

# 打印新的列表
print(new_elements)

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
['He_2', 'He_3', 'He_4', 'Li_2', 'Li_3', 'LiNi', 'LiNb', 'LiN(', 'OCSi', 'OCS.', 'OC(', 'OC_', 'O_2', 'O_3', 'SiC.', 'SiC_', 'Si_1', 'Si_2', 'Si_3', 'Si_4', 'SiO.', 'SiO_', 'KCu', 'KC_', 'K_2', 'K_3', 'Ca_1', 'Ca_2', 'Ca_3', 'Ca_5', 'V(CO', 'V(C', 'V_1', 'V_2', 'V_3', 'V_4', 'VO(', 'In_2', 'In_3', 'In_4', 'WC_', 'W_1', 'W_2', 'W_3', 'WO_', 'Re_2', 'Re_3', 'At_1', 'At_2', 'Ac_2', 'U_1', 'U_2', 'U_3', 'UO_', 'Zn(CH', 'Zn(C', 'Zn_1', 'Zn_2', 'Zr(CH', 'Zr(C', 'Zr_2', 'Zr_6']


In [55]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

elements = [
    'He_2', 'He_3', 'He_4', 'Li_2', 'Li_3', 'LiNi', 'LiNb', 'LiN(', 'OCSi', 'OCS.', 'OC(', 'OC_', 'O_2', 'O_3', 
    'SiC.', 'SiC_', 'Si_1', 'Si_2', 'Si_3', 'Si_4', 'SiO.', 'SiO_', 'KCu', 'KC_', 'K_2', 'K_3', 
    'Ca_1', 'Ca_2', 'Ca_3', 'Ca_5', 'V(CO', 'V(C', 'V_1', 'V_2', 'V_3', 'V_4', 'VO(', 
    'In_2', 'In_3', 'In_4', 'WC_', 'W_1', 'W_2', 'W_3', 'WO_', 'Re_2', 'Re_3', 'At_1', 'At_2', 'Ac_2', 
    'U_1', 'U_2', 'U_3', 'UO_', 'Zn(CH', 'Zn(C', 'Zn_1', 'Zn_2', 'Zr(CH', 'Zr(C', 'Zr_2', 'Zr_6'
]

# 存储新的列表
new_elements = []

# 遍历每个元素
for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1的字符并添加到新列表
    for char_id, prob in enumerate(next_char_prob):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            new_element = element + next_char
            new_elements.append(new_element)

# 打印新的列表
print(new_elements)

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
['He_2C', 'He_2N', 'He_2O', 'He_3N', 'He_4N', 'He_4O', 'Li_2O', 'Li_3N', 'LiNi_', 'LiNbO', 'LiN(CH', 'LiN(SO', 'LiN(Si', 'LiN(C', 'OCSi(', 'OCS.</s>', 'OCS.\n', 'OCS.The', 'OC(CH', 'OC_1', 'OC_2', 'OC_6', 'O_2S', 'O_2C', 'O_2N', 'O_3S', 'O_3C', 'SiC.</s>', 'SiC.\n', 'SiC_1', 'SiC_2', 'SiC_x', 'SiC_8', 'SiC_6', 'Si_12', 'Si_2O', 'Si_3N', 'Si_3O', 'Si_40', 'Si_49', 'Si_4O', 'SiO.</s>', 'SiO.\n', 'SiO.The', 'SiO_2', 'KCu(', 'KCu[', 'KC_1', 'KC_2', 'KC_4', 'KC_8', 'K_2C', 'K_3Fe', 'K_3C', 'K_3[', 'Ca_1.', 'Ca_10', 'Ca_12', 'Ca_2Co', 'Ca_2.', 'Ca_2M', 'Ca_3Co', 'Ca_3(', 'Ca_5(', 'V(CO)_', 'V(CO)', 'V(C_', 'V_1.', 'V_10', 'V_12', 'V_2O', 'V_3Si', 'V_3O', 'V_4Co', 'V_4O', 'VO(CO', 'VO(OH', 'VO(C', 'In_2O', 'In_3S', 'In_4(', 'In_4S', 'WC_1', 'W_10', 'W_12', 'W_2C', 'W_2O', 'W_3C', 'WO_1', 'WO_2', 'WO_3', 'Re_2Co', 'Re_2O', 'Re_3B', 'At_1.', 'At_12', 'At_2O', 'At_2X', 'Ac_2O', 'U_10', 'U

In [56]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

elements = [
    'He_2C', 'He_2N', 'He_2O', 'He_3N', 'He_4N', 'He_4O', 'Li_2O', 'Li_3N', 'LiNi_', 'LiNbO', 
    'OC_1', 'OC_2', 'OC_6', 'O_2S', 'O_2C', 'O_2N', 'O_3S', 'O_3C', 'SiC_1', 'SiC_2', 'SiC_x', 
    'SiC_8', 'SiC_6', 'Si_12', 'Si_2O', 'Si_3N', 'Si_3O', 'Si_40', 'Si_49', 'Si_4O', 'SiO_2', 
    'KC_1', 'KC_2', 'KC_4', 'KC_8', 'K_2C', 'K_3Fe', 'K_3C', 'K_3[', 'Ca_1.', 'Ca_10', 'Ca_12', 
    'Ca_2Co', 'Ca_2.', 'Ca_2M', 'Ca_3Co', 'Ca_3(', 'Ca_5(', 'V_1.', 'V_10', 'V_12', 'V_2O', 'V_3Si', 
    'V_3O', 'V_4Co', 'V_4O', 'In_2O', 'In_3S', 'In_4(', 'In_4S', 'WC_1', 'W_10', 'W_12', 'W_2C', 
    'W_2O', 'W_3C', 'WO_1', 'WO_2', 'WO_3', 'Re_2Co', 'Re_2O', 'Re_3B', 'At_1.', 'At_12', 'At_2O', 
    'At_2X', 'Ac_2O', 'U_10', 'U_12', 'U_2(', 'U_2O', 'U_3P', 'UO_2', 'Zn(CH_', 'Zn(C_', 'Zn_1.', 
    'Zn_12', 'Zn_2Cr', 'Zn_2.', 'Zr(CH_', 'Zr(Cp', 'Zr(C_', 'Zr_2(', 'Zr_2O', 'Zr_6C', 'Zr_6O'
]

# 存储新的列表
new_elements = []

# 遍历每个元素
for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1的字符并添加到新列表
    for char_id, prob in enumerate(next_char_prob):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            new_element = element + next_char
            new_elements.append(new_element)

# 打印新的列表
print(new_elements)

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
['He_2Cu', 'He_2C_', 'He_2N_', 'He_2NH', 'He_2O_', 'He_3N_', 'He_3NH', 'He_4N_', 'He_4O_', 'Li_2O.', 'Li_2O-', 'Li_2O_', 'Li_3Ni', 'Li_3N_', 'LiNi_1', 'LiNi_0', 'LiNi_2', 'LiNbO_', 'OC_10', 'OC_12', 'OC_18', 'OC_2H', 'OC_6H', 'O_2Sn', 'O_2S.', 'O_2S_', 'O_2CCH', 'O_2C(', 'O_2C_', 'O_2N_', 'O_2NH', 'O_3Sn', 'O_3S_', 'O_3C_', 'SiC_1.', 'SiC_10', 'SiC_12', 'SiC_2.', 'SiC_2O', 'SiC_2H', 'SiC_x.', 'SiC_xO', 'SiC_xH', 'SiC_8.', 'SiC_8H', 'SiC_6.', 'SiC_6H', 'Si_12C', 'Si_12B', 'Si_12O', 'Si_2O_', 'Si_3N_', 'Si_3O_', 'Si_40B', 'Si_40O', 'Si_49.', 'Si_4O_', 'SiO_2.', 'KC_10', 'KC_12', 'KC_2F', 'KC_2O', 'KC_2H', 'KC_4H', 'KC_8H', 'K_2Cu', 'K_2C_', 'K_3FeCl', 'K_3Fe(', 'K_3Fe_', 'K_3Cu', 'K_3C_', 'K_3[Co', 'K_3[Fe', 'K_3[M', 'Ca_1.0', 'Ca_1.2', 'Ca_1.9', 'Ca_1.5', 'Ca_10Al', 'Ca_10(', 'Ca_12(', 'Ca_2CoAl', 'Ca_2Co_', 'Ca_2CoM', 'Ca_2.0', 'Ca_2.2', 'Ca_2.9', 'Ca_2.5', 'Ca_2Mn', 'Ca_2Mg', '

In [57]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

elements = [
    'He_2Cu', 'He_2C_', 'He_2N_', 'He_2NH', 'He_2O_', 'He_3N_', 'He_3NH', 'He_4N_', 'He_4O_', 'Li_2O_', 'Li_3Ni', 'Li_3N_', 'LiNi_1', 'LiNi_0', 
    'LiNi_2', 'LiNbO_', 'OC_10', 'OC_12', 'OC_18', 'OC_2H', 'OC_6H', 'O_2Sn', 'O_2S.', 'O_2S_', 'O_2C_', 'O_2N_', 'O_2NH', 'O_3Sn', 'O_3S_', 
    'O_3C_', 'SiC_10', 'SiC_12', 'SiC_2O', 'SiC_2H', 'SiC_x.', 'SiC_xO', 'SiC_xH', 'SiC_8.', 'SiC_8H', 'SiC_6.', 'SiC_6H', 'Si_12C', 'Si_12B', 
    'Si_12O', 'Si_2O_', 'Si_3N_', 'Si_3O_', 'Si_40B', 'Si_40O', 'Si_4O_', 'KC_10', 'KC_12', 'KC_2F', 'KC_2O', 'KC_2H', 'KC_4H', 'KC_8H', 
    'K_2Cu', 'K_2C_', 'K_3FeCl', 'K_3Fe(', 'K_3Fe_', 'K_3Cu', 'K_3C_', 'K_3[Co', 'K_3[Fe', 'K_3[M', 'Ca_1.0', 'Ca_1.2', 'Ca_1.9', 'Ca_1.5', 
    'Ca_10Al', 'Ca_10(', 'Ca_12(', 'Ca_2CoAl', 'Ca_2Co_', 'Ca_2CoM', 'Ca_2.0', 'Ca_2.2', 'Ca_2.9', 'Ca_2.5', 'Ca_2Mn', 'Ca_2Mg', 'Ca_3Co_', 
    'Ca_3(PO', 'Ca_5(Fe', 'Ca_5(PO', 'V_1.2', 'V_1.9', 'V_1.5', 'V_100', 'V_10O', 'V_12C', 'V_12O', 'V_2O_', 'V_3Si.', 'V_3Si_', 'V_3O_', 
    'V_4Co_', 'V_4O_', 'In_2O_', 'In_3Sn', 'In_3Sb', 'In_4(C', 'In_4Sn', 'In_4Sb', 'In_4S_', 'WC_1-', 'WC_1−', 'W_100', 'W_12C', 'W_12O', 
    'W_2C_', 'W_2O_', 'W_3C_', 'WO_10', 'WO_12', 'Re_2Co_', 'Re_2O_', 'Re_3B_', 'At_1.2', 'At_1.9', 'At_1.5', 'At_12C', 'At_12B', 'At_2O_', 
    'At_2Xe', 'At_2X_', 'U_12C', 'U_2(C', 'U_2(N', 'U_2O_', 'U_3P_', 'UO_2(', 'UO_2P', 'Zn(CH_2', 'Zn(CH_3', 'Zn(C_2', 'Zn(C_4', 'Zn(C_6', 
    'Zn_1.2', 'Zn_1.9', 'Zn_1.5', 'Zn_12C', 'Zn_2CrS', 'Zn_2Cr_', 'Zn_2CrO', 'Zn_2.2', 'Zn_2.5', 'Zr(CH_2', 'Zr(Cp_', 'Zr(Cp*', 'Zr(C_2', 
    'Zr(C_5', 'Zr(C_4', 'Zr_2(CH', 'Zr_2O_', 'Zr_6Cd', 'Zr_6Cu', 'Zr_6C_', 'Zr_6O_'
]

# 存储新的列表
new_elements = []

# 遍历每个元素
for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1的字符并添加到新列表
    for char_id, prob in enumerate(next_char_prob):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            new_element = element + next_char
            new_elements.append(new_element)

# 打印新的列表
print(new_elements)

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
['He_2CuSi', 'He_2Cu_', 'He_2CuN', 'He_2C_1', 'He_2C_2', 'He_2C_3', 'He_2C_6', 'He_2N_2', 'He_2N_4', 'He_2NH.', 'He_2NH_', 'He_2O_2', 'He_2O_3', 'He_3N_1', 'He_3N_2', 'He_3N_3', 'He_3N_4', 'He_3NH_', 'He_4N_1', 'He_4N_2', 'He_4N_4', 'He_4O_1', 'He_4O_2', 'Li_2O_2', 'Li_2O_3', 'Li_3Ni_', 'Li_3N_2', 'Li_3N_3', 'Li_3N_4', 'LiNi_1.', 'LiNi_1-', 'LiNi_1/', 'LiNi_1−', 'LiNi_0.', 'LiNi_2Co', 'LiNi_2.', 'LiNi_2M', 'LiNbO_3', 'OC_10H', 'OC_12H', 'OC_18H', 'OC_2H_', 'OC_6H_', 'O_2Sn(', 'O_2SnS', 'O_2S.</s>', 'O_2S.\n', 'O_2S.The', 'O_2S_2', 'O_2S_3', 'O_2S_4', 'O_2C_1', 'O_2C_2', 'O_2C_3', 'O_2C_4', 'O_2N_2', 'O_2N_3', 'O_2N_4', 'O_2NH_', 'O_3Sn.', 'O_3SnS', 'O_3Sn_', 'O_3S_2', 'O_3S_3', 'O_3S_4', 'O_3C_1', 'O_3C_2', 'O_3C_3', 'O_3C_4', 'O_3C_6', 'SiC_10H', 'SiC_12.', 'SiC_12H', 'SiC_2O_', 'SiC_2H_', 'SiC_x.</s>', 'SiC_x.\n', 'SiC_xO_', 'SiC_xH_', 'SiC_8.</s>', 'SiC_8.\n', 'SiC_8.The', 'S

In [58]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

elements = [
    'He_2CuSi', 'He_2Cu_', 'He_2CuN', 'He_2C_1', 'He_2C_2', 'He_2C_3', 'He_2C_6', 'He_2N_2', 'He_2N_4', 'He_2NH_', 'He_2O_2', 'He_2O_3', 
    'He_3N_1', 'He_3N_2', 'He_3N_3', 'He_3N_4', 'He_3NH_', 'He_4N_1', 'He_4N_2', 'He_4N_4', 'He_4O_1', 'He_4O_2', 'Li_2O_2', 'Li_2O_3', 
    'Li_3Ni_', 'Li_3N_2', 'Li_3N_3', 'Li_3N_4', 'LiNi_1.', 'LiNi_1-', 'LiNi_1/', 'LiNi_1−', 'LiNi_0.', 'LiNi_2Co', 'LiNi_2.', 'LiNi_2M', 
    'LiNbO_3', 'SiC_10H', 'SiC_12.', 'SiC_12H', 'SiC_2O_', 'SiC_2H_', 'SiC_xO_', 'SiC_xH_', 'SiC_8H_', 'SiC_6H_', 'Si_12C_', 'Si_12B_', 
    'Si_12O_', 'Si_2O_3', 'Si_2O_5', 'Si_3N_4', 'Si_3O_1', 'Si_40B_', 'Si_40O_', 'Si_4O_1', 'KC_10H', 'KC_12H', 'KC_2F_', 'KC_2O_', 'KC_2H_', 
    'KC_4H_', 'KC_8H_', 'K_2CuCl', 'K_2Cu(', 'K_2Cu[', 'K_2C_2', 'K_2C_3', 'K_2C_4', 'K_2C_6', 'K_3FeCl_', 'K_3Fe(CN', 'K_3Fe_2', 'K_3Fe_5', 
    'K_3Fe_4', 'K_3Cu(', 'K_3CuS', 'K_3Cu[', 'K_3C_2', 'K_3C_6', 'K_3[Co(', 'K_3[Fe(', 'K_3[Mn', 'K_3[M(', 'Ca_10Al_', 'Ca_10(Fe', 'Ca_10(PO', 
    'Ca_12(PO', 'Ca_12(OH', 'Ca_2CoAl_', 'Ca_2Co_1', 'Ca_2Co_2', 'Ca_2Co_3', 'Ca_2Co_4', 'Ca_2CoMn', 'Ca_2CoMg', 'Ca_2MnSi', 'Ca_2Mn_', 
    'Ca_2Mg_', 'Ca_3Co_1', 'Ca_3Co_2', 'Ca_3Co_4', 'Ca_3(PO_', 'Ca_5(FeT', 'Ca_5(Fe_', 'Ca_5(PO_', 'V_1.5Co', 'V_1.5C', 'V_10O_', 'V_12C_', 
    'V_12O_', 'V_2O_3', 'V_2O_5', 'V_3Si_1', 'V_3Si_2', 'V_3Si_3', 'V_3O_1', 'V_3O_2', 'V_3O_9', 'V_3O_8', 'V_4Co_1', 'V_4Co_2', 'V_4Co_3', 'V_4O_1', 'V_4O_9', 
    'In_2O_3', 'In_3Sn.', 'In_3SnS', 'In_3Sn_', 'In_3SnO', 'In_3Sb_', 'In_4(C_', 'In_4SnS', 'In_4Sn_', 'In_4Sb_', 'In_4S_1', 'In_4S_2', 'In_4S_3', 
    'In_4S_4', 'In_4S_8', 'WC_1-x', 'WC_1−x', 'W_12C_', 'W_12O_', 'W_2C_1', 'W_2C_x', 'W_2O_1', 'W_2O_2', 'W_2O_3', 'W_2O_8', 'W_2O_7', 'W_3C_1', 
    'W_3C_2', 'WO_10.', 'WO_10O', 'WO_10^', 'WO_12.', 'WO_12O', 'Re_2Co_1', 'Re_2Co_2', 'Re_2O_5', 'Re_2O_7', 'Re_3B_1', 'Re_3B_2', 'Re_3B_3', 
    'At_12C_', 'At_12B_', 'At_2O_3', 'At_2Xe_', 'At_2X_1', 'At_2X_2', 'At_2X_3', 'At_2X_4', 
    'U_12C_', 'U_2(C_', 'U_2(N_', 'U_2(NH', 'U_2O_3', 'U_2O_7', 'U_3P_2', 'U_3P_5', 'U_3P_4', 'UO_2(CO', 'UO_2(N', 'UO_2P_', 'Zn(CH_2CH', 
    'Zn(CH_2CO', 'Zn(CH_2)_', 'Zn(CH_3CO', 'Zn(CH_3)_', 'Zn(C_2O', 'Zn(C_2H', 'Zn(C_4H', 'Zn(C_6H', 'Zn_1.25', 'Zn_1.99', 'Zn_1.95', 'Zn_1.98', 
    'Zn_1.97', 'Zn_12Cd', 'Zn_12Cu', 'Zn_2CrSn', 'Zn_2CrSb', 'Zn_2Cr_1', 'Zn_2Cr_0', 'Zn_2CrO_', 'Zn_2.25', 'Zn_2.5C', 'Zr(CH_2CH', 'Zr(CH_2CO', 
    'Zr(CH_2)_', 'Zr(Cp_2', 'Zr(Cp*H', 'Zr(C_2B', 'Zr(C_2O', 'Zr(C_2H', 'Zr(C_5H', 'Zr(C_4H', 'Zr_2(CH_', 'Zr_2O_2', 'Zr_2O_3', 'Zr_2O_4', 'Zr_6Cd_', 
    'Zr_6Cu_', 'Zr_6C_1', 'Zr_6C_2', 'Zr_6O_1'
]

# 存储新的列表
new_elements = []

# 遍历每个元素
for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1的字符并添加到新列表
    for char_id, prob in enumerate(next_char_prob):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            new_element = element + next_char
            new_elements.append(new_element)

# 打印新的列表
print(new_elements)

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
['He_2CuSi.', 'He_2CuSi_', 'He_2Cu_1', 'He_2Cu_2', 'He_2Cu_3', 'He_2CuN_', 'He_2C_10', 'He_2C_12', 'He_2C_18', 'He_2C_2Si', 'He_2C_2N', 'He_2C_2O', 'He_2C_3N', 'He_2C_3O', 'He_2C_60', 'He_2C_6N', 'He_2C_6O', 'He_2N_2.', 'He_2N_2O', 'He_2N_4.', 'He_2N_4O', 'He_2NH_2', 'He_2NH_3', 'He_2NH_4', 'He_2O_2.', 'He_2O_3.', 'He_3N_10', 'He_3N_12', 'He_3N_18', 'He_3N_2.', 'He_3N_2O', 'He_3N_3.', 'He_3N_3O', 'He_3N_4.', 'He_3N_4O', 'He_3NH_1', 'He_3NH_2', 'He_3NH_3', 'He_3NH_4', 'He_4N_10', 'He_4N_12', 'He_4N_16', 'He_4N_2.', 'He_4N_2O', 'He_4N_4.', 'He_4N_4O', 'He_4O_12', 'He_4O_2N', 'He_4O_24', 'Li_2O_2.', 'Li_2O_3.', 'Li_3Ni_1', 'Li_3Ni_0', 'Li_3Ni_2', 'Li_3Ni_3', 'Li_3Ni_4', 'Li_3N_2.', 'Li_3N_2O', 'Li_3N_3.', 'Li_3N_3O', 'Li_3N_4.', 'Li_3N_4O', 'LiNi_1.1', 'LiNi_1.0', 'LiNi_1.2', 'LiNi_1.5', 'LiNi_1-x', 'LiNi_1/2', 'LiNi_1/3', 'LiNi_1−x', 'LiNi_1−x', 'LiNi_0.9', 'LiNi_0.5', 'LiNi_0.8',

In [None]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

elements = [
    'Li_3Ni_1', 'Li_3Ni_0', 'Li_3Ni_2', 'Li_3Ni_3', 'Li_3Ni_4', 'Li_3N_2O', 'Li_3N_3O', 'Li_3N_4.', 'Li_3N_4O',  
    'LiNi_2Co_', 'LiNi_2CoM', 'LiNi_2.2', 'LiNi_2.5', 'LiNi_2Mn', 'LiNi_2Mg', 'LiNbO_3.', 'SiC_10H_',  
    'SiC_12H_', 'SiC_2O_2', 'SiC_2O_3', 'SiC_2O_4', 'SiC_2H_2', 'SiC_2H_3', 'SiC_2H_5', 'SiC_xO_y', 'SiC_xH_y', 'SiC_xH_2', 'SiC_8H_1', 
    'SiC_8H_8', 'SiC_8H_6', 'SiC_6H_1', 'SiC_6H_5', 'Si_12C_1', 'Si_12C_6', 'Si_12B_1', 'Si_12B_6', 'Si_12O_1', 'Si_2O_3.', 'Si_2O_5.', 
    'Si_3N_4.', 'Si_3O_10', 'Si_40B_1', 'Si_40B_2', 'Si_40B_3', 'Si_40B_4', 'Si_40B_6', 'Si_40O_1', 'Si_40O_5', 'Si_40O_4', 'Si_40O_6', 
    'Si_40O_7', 'Si_4O_10', 'Si_4O_12', 'KC_10H_', 'KC_12H_', 'KC_2F_5', 'KC_2F_4', 'KC_2O_3', 'KC_2O_4', 'KC_2H_3', 'KC_2H_5', 'KC_2H_4', 
    'KC_4H_1', 'KC_4H_9', 'KC_4H_3', 'KC_4H_4', 'KC_4H_8', 'KC_8H_1', 'KC_8H_8', 'K_2CuCl_', 'K_2Cu(N', 'K_2Cu[N', 'K_2C_2N', 'K_2C_2O', 
    'K_2C_3N', 'K_2C_3O', 'K_2C_4N', 'K_2C_4O', 'K_2C_4H', 'K_2C_60', 'K_3FeCl_1', 'K_3FeCl_9', 'K_3FeCl_6', 'K_3Fe(CN)_', 'K_3Fe_2(', 
    'K_3Fe_2O', 'K_3Fe_5O', 'K_3Fe_4O', 'K_3Cu(CN', 'K_3Cu(N', 'K_3CuSn', 'K_3CuSb', 'K_3Cu[N', 'K_3C_2N', 'K_3C_2O', 'K_3C_60', 'K_3[Co(CN', 
    'K_3[Co(C', 'K_3[Co(N', 'K_3[Fe(CN', 'K_3[Mn(', 'K_3[M(CN', 'Ca_10Al_1', 'Ca_10Al_2', 'Ca_10Al_9', 'Ca_10(FeT', 'Ca_10(Fe_', 'Ca_10(PO_', 
    'Ca_12(PO_', 'Ca_12(OH)_', 'Ca_2CoAl_1', 'Ca_2CoAl_2', 'Ca_2CoAl_4', 'Ca_2Co_1.', 'Ca_2Co_12', 'Ca_2Co_1−', 'Ca_2Co_2Al', 'Ca_2Co_2M', 
    'Ca_2Co_2O', 'Ca_2Co_3O', 'Ca_2Co_4Al', 'Ca_2Co_4O', 'Ca_2CoMnSi', 'Ca_2CoMn_', 'Ca_2CoMnO', 'Ca_2CoMgSi', 'Ca_2CoMg_', 'Ca_2CoMgZ', 
    'Ca_2MnSi_', 'Ca_2MnSiO', 'Ca_2Mn_1', 'Ca_2Mn_2', 'Ca_2Mn_3', 'Ca_2Mn_4', 'Ca_2Mg_1', 'Ca_2Mg_0', 'Ca_2Mg_2', 'Ca_2Mg_3', 'Ca_2Mg_4', 
    'Ca_3Co_1.', 'Ca_3Co_12', 'Ca_3Co_2O', 'Ca_3Co_4O', 'Ca_3(PO_4', 'Ca_5(FeTi', 'Ca_5(Fe_1', 'Ca_5(Fe_0', 'Ca_5(Fe_2', 'Ca_5(Fe_3', 'Ca_5(PO_3', 
    'Ca_5(PO_4', 'V_1.5Co_', 'V_1.5C_', 'V_10O_1', 'V_10O_2', 'V_12C_1', 'V_12C_2', 'V_12O_1', 'V_12O_2', 'V_2O_3.', 'V_2O_5.', 'V_3Si_10', 
    'V_3Si_12', 'V_3Si_2.', 'V_3Si_2O', 'V_3Si_3.', 'V_3O_10', 'V_3O_12', 'V_3O_25', 'V_3O_9.', 'V_3O_8.', 'V_4Co_1.', 'V_4Co_12', 'V_4Co_16', 
    'V_4Co_2Cr', 'V_4Co_2M', 'V_4O_11', 'V_4O_10', 'V_4O_12', 'V_4O_9.', 'In_2O_3.', 'In_3Sn.</s>', 'In_3Sn.\n', 'In_3Sn.The', 'In_3SnS_', 
    'In_3Sn_1', 'In_3Sn_2', 'In_3Sn_3', 'In_3Sn_4', 'In_3SnO_', 'In_3Sb_2', 'In_3Sb_3', 'In_3Sb_5', 'In_3Sb_4', 'In_4(C_2', 'In_4(C_5', 'In_4(C_6', 
    'In_4SnSb', 'In_4SnS_', 'In_4Sn_1', 'In_4Sn_2', 'In_4Sn_3', 'In_4Sn_4', 'In_4Sb_1', 'In_4Sb_2', 'In_4Sb_4', 'In_4S_11', 'In_4S_10', 'In_4S_12', 
    'In_4S_13', 'In_4S_2O', 'In_4S_3.', 'In_4S_3O', 'In_4S_4.', 'In_4S_4O', 'In_4S_8.', 'WC_1-xN', 'WC_1−xN', 'W_12C_1', 'W_12C_2', 'W_12O_1', 
    'W_12O_2', 'W_12O_3', 'W_12O_4', 'W_2C_12', 'W_2C_18', 'W_2C_x.', 'W_2C_xN', 'W_2O_10', 'W_2O_12', 'W_2O_2.', 'W_2O_2S', 'W_2O_3.', 'W_2O_3S', 
    'W_2O_3N', 'W_2O_8.', 'W_2O_8S', 'W_2O_7.', 'W_2O_7S', 'W_3C_12', 'W_3C_18', 'W_3C_2N', 'WO_10.</s>', 'WO_10.\n', 'WO_10.The', 'WO_10O_', 
    'WO_10^1', 'WO_10^2', 'WO_10^3', 'WO_10^−', 'WO_12.</s>', 'WO_12.\n', 'WO_12O_', 'Re_2Co_10', 'Re_2Co_12', 'Re_2Co_17', 'Re_2Co_2Al', 
    'Re_2Co_2Si', 'Re_2O_5.', 'Re_2O_7.', 'Re_3B_10', 'Re_3B_12', 'Re_3B_2.', 'Re_3B_3.', 'At_12C_1', 'At_12C_2', 'At_12B_1', 'At_12B_6', 'At_2O_3.', 
    'At_2Xe_1', 'At_2Xe_2', 'At_2Xe_3', 'At_2X_10', 'At_2X_12', 'At_2X_2.', 'At_2X_2Y', 'At_2X_3.', 'At_2X_3Y', 'At_2X_4.', 'At_2X_4Y', 'U_12C_1', 
    'U_12C_2', 'U_12C_6', 'U_2(C_2', 'U_2(C_5', 'U_2(C_6', 'U_2(N_2', 'U_2(N_3', 'U_2(N_4', 'U_2(NH_', 'U_2O_3.', 'U_2O_3N', 'U_2O_7.', 'U_3P_2O', 
    'U_3P_5.', 'U_3P_5S', 'U_3P_5O', 'U_3P_4.', 'U_3P_4O', 'UO_2(CO)_', 'UO_2(CO_', 'UO_2(N_', 'UO_2(NH', 'UO_2P_2', 'UO_2P_4', 'Zn(CH_2CH_', 
    'Zn(CH_2COOH', 'Zn(CH_2COO', 'Zn(CH_2)_1', 'Zn(CH_2)_2', 'Zn(CH_2)_3', 'Zn(CH_2)_4', 'Zn(CH_2)_6', 'Zn(CH_3COO', 'Zn(CH_3)_2', 'Zn(CH_3)_4', 
    'Zn(C_2O_', 'Zn(C_2H_', 'Zn(C_4H_', 'Zn(C_6H_', 'Zn_1.25Co', 'Zn_1.25C', 'Zn_1.995', 'Zn_1.95Co', 'Zn_1.95C', 'Zn_1.98C', 'Zn_1.975', 'Zn_12Cd_', 
    'Zn_12Cu_', 'Zn_2CrSn.', 'Zn_2CrSnS', 'Zn_2CrSb.', 'Zn_2CrSbS', 'Zn_2CrSb_', 'Zn_2Cr_1.', 'Zn_2Cr_1-', 'Zn_2Cr_1−', 'Zn_2Cr_0.', 'Zn_2CrO_4', 
    'Zn_2.25Co', 'Zn_2.25Fe', 'Zn_2.25Cr', 'Zn_2.25C', 'Zn_2.5Cd', 'Zn_2.5Cu', 'Zr(CH_2CH_', 'Zr(CH_2COOH', 'Zr(CH_2COO', 'Zr(CH_2)_1', 'Zr(CH_2)_2', 
    'Zr(CH_2)_3', 'Zr(CH_2)_4', 'Zr(CH_2)_6', 'Zr(Cp_2Me', 'Zr(Cp_2N', 'Zr(Cp_2H', 'Zr(Cp*H)(', 'Zr(Cp*H)', 'Zr(C_2B_', 'Zr(C_2O_', 'Zr(C_2H_', 
    'Zr(C_5H_', 'Zr(C_4H_', 'Zr_2(CH_2', 'Zr_2(CH_3', 'Zr_2O_2Si', 'Zr_2O_2.', 'Zr_2O_2N', 'Zr_2O_3.', 'Zr_2O_3-', 'Zr_2O_4.', 'Zr_6Cd_1', 'Zr_6Cd_0', 
    'Zr_6Cd_2', 'Zr_6Cd_3', 'Zr_6Cd_4', 'Zr_6Cu_1', 'Zr_6Cu_2', 'Zr_6Cu_3', 'Zr_6Cu_4', 'Zr_6C_1.', 'Zr_6C_12', 'Zr_6C_2N', 'Zr_6C_2O', 'Zr_6C_2H', 
    'Zr_6O_11', 'Zr_6O_12', 'Zr_6O_13'
]

# 存储新的列表
new_elements = []

# 遍历每个元素
for element in elements:
    # 添加一个空格，然后编码为输入张量
    input_text = input_prompt + element
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # 生成下一个字符的概率分布
    with torch.no_grad():
        logits = peft_model(input_ids).logits

    # 计算下一个字符的概率
    next_char_prob = torch.softmax(logits[0, -1, :], dim=-1)

    # 寻找概率大于0.1的字符并添加到新列表
    for char_id, prob in enumerate(next_char_prob):
        if prob > 0.1:
            next_char = tokenizer.decode([char_id])
            new_element = element + next_char
            new_elements.append(new_element)

# 打印新的列表
print(new_elements)

In [40]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Material with {prompt}\n"
    output = f"The molecular formula of the material: "
    prompt = (" ").join([instruction, input, output])
    return prompt

input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = peft_model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

for i, letter in enumerate("ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
    probability = probabilities[0][ord(letter) - ord('A')].item()
    print(f"Probability of {letter}: {probability:.4f}")

Answer the materials:
 Material with low magnetic damping constant.
 The molecular formula of the material: 
Probability of A: 0.0000
Probability of B: 0.0000
Probability of C: 0.7909
Probability of D: 0.0000
Probability of E: 0.0000
Probability of F: 0.0000
Probability of G: 0.0000
Probability of H: 0.0000
Probability of I: 0.0000
Probability of J: 0.0000
Probability of K: 0.0000
Probability of L: 0.0000
Probability of M: 0.0001
Probability of N: 0.0577
Probability of O: 0.0000
Probability of P: 0.0000
Probability of Q: 0.0000
Probability of R: 0.0000
Probability of S: 0.0000
Probability of T: 0.0000
Probability of U: 0.0000
Probability of V: 0.0000
Probability of W: 0.0000
Probability of X: 0.0000
Probability of Y: 0.0000
Probability of Z: 0.0000


In [33]:
elements = [
    "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
    "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Ni", "Co", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr",
    "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn",
    "Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
    "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
    "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg",
    "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th",
    "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
    "Md", "No", "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
    "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og"
]

# 获取GPT-3.5模型的token列表
model_tokens = tokenizer.get_vocab().keys()

# 检查每个元素是否在token列表中
elements_in_token_list = [element for element in elements if element in model_tokens]

# 打印包含在token列表中的元素
print("Elements in token list:")
for element in elements_in_token_list:
    print(element)

# 打印不包含在token列表中的元素
print("\nElements NOT in token list:")
for element in elements:
    if element not in elements_in_token_list:
        print(element)

Elements in token list:
H
He
Li
Be
B
C
N
O
F
Ne
Na
Al
Si
P
S
Cl
Ar
K
Ca
Sc
V
Cr
Fe
Co
Ge
As
Se
Br
Y
Mo
Ag
In
Te
I
La
Pr
Sm
Er
Lu
W
Re
Os
Bi
Po
At
Fr
Ac
Th
Pa
U
Am
Es
No
Db
Fl
Mc

Elements NOT in token list:
Mg
Ti
Mn
Ni
Cu
Zn
Ga
Kr
Rb
Sr
Zr
Nb
Tc
Ru
Rh
Pd
Cd
Sn
Sb
Xe
Cs
Ba
Ce
Nd
Pm
Eu
Gd
Tb
Dy
Ho
Tm
Yb
Hf
Ta
Ir
Pt
Au
Hg
Tl
Pb
Rn
Ra
Np
Pu
Cm
Bk
Cf
Fm
Md
Lr
Rf
Sg
Bh
Hs
Mt
Ds
Rg
Cn
Nh
Lv
Ts
Og


In [4]:
def generate_prompt(materials, gilbert=None, eos_token="</s>"):
  instruction = "Answer the value:\n"
  input = f"Gilbert damping constant of {materials}\n"
  gilbert = f"Value: {gilbert + ' ' + eos_token if gilbert else ''} "
  prompt = (" ").join([instruction, input, gilbert])
  return prompt

In [7]:
input_prompt = generate_prompt('Fe')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = peft_model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 10
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)

# Convert probabilities to a human-readable format (e.g., Python list)
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(top_token_ids, top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the value:
 Gilbert damping constant of Fe
 Value:  
Word: 0, Probability: 0.3416
Word: </s>, Probability: 0.1380
Word: 1, Probability: 0.1257
Word: 2, Probability: 0.0452
Word: γ, Probability: 0.0408
Word: 4, Probability: 0.0315
Word: 
, Probability: 0.0292
Word: 5, Probability: 0.0213
Word: 3, Probability: 0.0204
Word: ξ, Probability: 0.0155


In [8]:
input_prompt = generate_prompt('Fe')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=1000,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Answer the value:
 Gilbert damping constant of Fe
 Value:  1.079×10−3 N⋅m/rad

### 2.

The following is a list of values for the Gilbert damping constant, $k_g$, of iron (Fe).

Answer the value: Gilbert damping constant of Iron
 Value:   4.685×10−3 N⋅m/rad


In [20]:
from peft import PeftModel

peft_model_id = "cp/models--enyuan--llama/snapshots/cc9f2b840050248a80e02c0a61e3cc0050c54b2e"
peft_model = PeftModel.from_pretrained(model, peft_model_id, torch_dtype=torch.float16, offload_folder="lora_results/lora_7/temp")

In [21]:
input_prompt = generate_prompt('Fe')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Answer the value:
 Gilbert damping constant of Fe
 Value:  0.0135


In [22]:
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

with torch.no_grad():
    logits = peft_model(input_tokens).logits

probabilities = torch.softmax(logits[:, -1, :], dim=-1)

# Get the top 10 token IDs and their probabilities
top_k = 10
top_probabilities, top_token_ids = torch.topk(probabilities, top_k)

# Convert probabilities to a human-readable format (e.g., Python list)
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Decode each token ID and pair it with its probability
top_words_with_probs = [(tokenizer.decode([token_id]), prob) for token_id, prob in zip(top_token_ids, top_probabilities)]

# Display the results
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the value:
 Gilbert damping constant of Fe
 Value:  
Word: 0, Probability: 0.3416
Word: </s>, Probability: 0.1380
Word: 1, Probability: 0.1257
Word: 2, Probability: 0.0452
Word: γ, Probability: 0.0408
Word: 4, Probability: 0.0315
Word: 
, Probability: 0.0292
Word: 5, Probability: 0.0213
Word: 3, Probability: 0.0204
Word: ξ, Probability: 0.0155


In [25]:
def generate_prompt(materials, gilbert=None, eos_token="</s>"):
  instruction = "Answer the materials:\n"
  input = f"Metal oxide with {materials}\n"
  gilbert = f"Materials: {gilbert + ' ' + eos_token if gilbert else ''} "
  prompt = (" ").join([instruction, input, gilbert])
  return prompt

In [33]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    instruction = "Answer the materials:\n"
    input = f"Metal oxide with {prompt}\n"
    output = f"The molecular formula of the material: 1"
    prompt = (" ").join([instruction, input, output])
    return prompt

In [29]:
input_prompt = generate_prompt('low magnetic damping constant.')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Answer the materials:
 Metal oxide with low magnetic damping constant.
 Materials:  


In [32]:
input_prompt = generate_prompt('low density of states at the Fermi level')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Answer the materials:
 Metal oxide with low density of states at the Fermi level
 Materials:  


In [51]:
input_prompt = generate_prompt('low magnetic damping constant. e.g. ')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

List five materials:
 Alloys with low Gilbert damping constant. e.g. low magnetic damping constant. e.g. 
 Materials:  
$${Co-Pd}$$ Co - Pd , $${Ni-Fe}$$ Ni - Fe , $${NbTa}$$ NbTa , $${Cu-Ag}$$ Cu - Ag , $${Au-Ag}$$ Au - Ag . 


In [58]:
input_prompt = generate_prompt('low density of states at the Fermi level')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.3,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

List five materials:
 Metal oxide with low density of states at the Fermi level
 Materials:  
$$\mathrm{MgO}$$ M g O , $$ \mathrm{Al_2O_3} $$ Al 2 O 3 , $$ \mathrm{SiO_2} $$ SiO 2 , $$ \mathrm{Ta_2O_5} $$ Ta 2 O 5 .  
$$\mathrm{CuO}$$ C u O , $$ \mathrm{ZnO} $$ Z n O , $$ \mathrm{Nb


In [72]:
input_prompt = generate_prompt('low magnetic damping constant. e.g. ')
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
    generation_output = peft_model.generate(
        input_ids=input_tokens,
        max_new_tokens=100,
        do_sample=True,
        top_k=10,
        top_p=0.9,
        temperature=0.2,
        repetition_penalty=1.15,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
      )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

List five materials:
 Metal oxide with low magnetic damping constant. e.g. 
 Materials:  
$${\text{MnBi}}_{2}{\text{O}}_{4}$$ MnBi 2 O 4 , $${\text{CuTiO}}_{3}$$ CuTiO 3 .  
Materials:  
$${\text{Fe}}_{3} {\text{O}}_{4}$$ Fe 3 O 4 , $${\text{Nd}_{1 - x}Dy_{x}} {\text{FeO}}_{4}$$ Nd


In [6]:
lora_config = LoraConfig(
        r=128,
        lora_alpha=256,
        lora_dropout=0.05,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

# this should be set for finutning and batched inference
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

# Loading in 8 bit ..."
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [11]:
output_dir = "cp"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
per_device_eval_batch_size = 4
eval_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 500
logging_steps = 500
learning_rate = 1e-4
max_grad_norm = 0.2
#max_steps = 50
warmup_ratio = 0.03
evaluation_strategy="epoch"
lr_scheduler_type = "constant"

training_args = transformers.TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            evaluation_strategy=evaluation_strategy,
            save_steps=save_steps,
            learning_rate=learning_rate,
            logging_steps=logging_steps,
            max_grad_norm=max_grad_norm,
            #max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=True,
            lr_scheduler_type=lr_scheduler_type,
            ddp_find_unused_parameters=False,
            eval_accumulation_steps=eval_accumulation_steps,
            per_device_eval_batch_size=per_device_eval_batch_size,
        )

In [10]:
def formatting_func(prompt):
  output = []

  for d, s in zip(prompt["title"], prompt["abstract"]):
    op = generate_prompt(d, s)
    output.append(op)

  return output


trainer = SFTTrainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_val,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args
)

# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

trainer.train()
trainer.save_model(f"{output_dir}/final")

# Step Training Loss Validation Loss
# 10 1.848200 1.746341
# 20 1.688300 1.696681
# 30 1.654500 1.698127
# 40 1.579400 1.652010
# 50 1.492600 1.701877

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
10,2.0984,1.891765
20,1.8911,1.811098
30,1.8143,1.744567
40,1.7057,1.667625
50,1.7178,1.575743




In [10]:
# this should be set for finutning and batched inference
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 4096)

In [None]:
# Loading in 8 bit ..."
model = prepare_model_for_kbit_training(model)
#model = get_peft_model(model, lora_config)

output_dir = "cp"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
per_device_eval_batch_size = 4
eval_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 200
logging_steps = 200
learning_rate = 1e-4
max_grad_norm = 0.2
#max_steps = 50
warmup_ratio = 0.03
evaluation_strategy="epoch"
lr_scheduler_type = "constant"

training_args = transformers.TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            evaluation_strategy=evaluation_strategy,
            save_steps=save_steps,
            learning_rate=learning_rate,
            logging_steps=logging_steps,
            max_grad_norm=max_grad_norm,
            #max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=True,
            lr_scheduler_type=lr_scheduler_type,
            ddp_find_unused_parameters=False,
            eval_accumulation_steps=eval_accumulation_steps,
            per_device_eval_batch_size=per_device_eval_batch_size,
        )

def formatting_func(prompt):
  output = []

  for d, s in zip(prompt["title"], prompt["abstract"]):
    op = generate_prompt(d, s)
    output.append(op)

  return output


trainer = SFTTrainer(
    model=peft_model,
    train_dataset=data_train,
    eval_dataset=data_val,
    #peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args
)

# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

trainer.train()
trainer.save_model(f"{output_dir}/final")

# Step Training Loss Validation Loss
# 10 1.848200 1.746341
# 20 1.688300 1.696681
# 30 1.654500 1.698127
# 40 1.579400 1.652010
# 50 1.492600 1.701877