In [1]:
import torch
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

In [2]:
# Load the base Llama-2 7B checkpoint with float16 weights and its matching tokenizer.
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Reload the base model with 8-bit weights, then attach the adapter published
# under 'enyuan/llama'.
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Passing `load_in_8bit=True` directly is deprecated (see the warning this
    # cell used to emit); the quantization settings go through a config object.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

# The tokenizer is loaded from the adapter repository rather than the base
# model, and the embedding matrix is resized to its length before the adapter
# weights are loaded.
tokenizer = AutoTokenizer.from_pretrained('enyuan/llama')
model.resize_token_embeddings(len(tokenizer))

model.load_adapter('enyuan/llama')

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Training split: the "train" split of the enyuan/Abstracts dataset.
data = load_dataset("enyuan/Abstracts")
data_train = data["train"]

# Evaluation examples are read from a local JSON file; they are accessed
# through the "train" key of the loaded dataset dict.
custom_data = load_dataset('json', data_files='data_eval.json')
data_val = custom_data['train']

# Print the dataset details (features and row counts).
for split in (data_train, data_val):
    print(split)

def generate_prompt(title, abstract=None, eos_token="</s>"):
  """Build the text prompt for one paper.

  Args:
    title: Paper title placed on the second line of the prompt.
    abstract: Abstract text. When falsy (None or empty) the prompt ends right
      after the "Abstract:" cue, which is the form used for generation.
    eos_token: Marker appended after the abstract when one is given.

  Returns:
    The prompt string; segments are separated by single spaces and the
    string always ends with a trailing space.
  """
  completion = abstract + ' ' + eos_token if abstract else ''
  return f"The abstract of the paper:\n {title}\n Abstract: {completion} "

# Preview the full training prompt (title + abstract + EOS) for the first training row.
print(generate_prompt(data_train[0]["title"], data_train[0]["abstract"]))

Dataset({
    features: ['title', 'abstract', 'publicationDate', 'doi'],
    num_rows: 165071
})
Dataset({
    features: ['title', 'abstract', 'publicationDate'],
    num_rows: 559
})
The abstract of the paper:
 Inconel 625 sustainable milling surface integrity and the dependence on alloy processing route
 Abstract: The discovery of deepwater oil and gas sources has altered the scenario of world production of oil products, attracting even more attention to nickel superalloys. However, this class of materials can be used in several applications. Furthermore, nickel superalloys are highly dependent on their processing history, and the manner in which superalloys react to machining can directly affect the finished product. This work aims to evaluate the surface integrity of two different materials after cryogenic side-milling in conditions that stimulate severe plastic deformation (SPD) and high heat generation. The results show that the material response to machining depends strongly on 

In [6]:
# Build a title-only prompt (no abstract) from training row 50 and let the
# model write the abstract.
input_prompt = generate_prompt(data_train[50]["title"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

# Sampling settings for generation; decoding stops at the tokenizer's EOS id
# or after 1000 new tokens.
generation_kwargs = dict(
    max_new_tokens=1000,
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.3,
    repetition_penalty=1.15,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

with torch.cuda.amp.autocast():
  generation_output = model.generate(input_ids=input_tokens, **generation_kwargs)

# Decode the single returned sequence (prompt included) without special tokens.
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

The abstract of the paper:
 Effect of cryogenic cooling on residual stresses and surface finish of 316L during hybrid manufacturing
 Abstract:   In this work, the effect of cryogenic cooling (CC) on residual stress and surface roughness is investigated in a hybrid additive manufactured part. A 3D printed stainless steel (SS) component was subjected to CC treatment using liquid nitrogen (LN2). Residual stress measurements were carried out by X-ray diffraction method and surface roughness analysis was performed with white light interferometry technique. The results show that the residual stress profile decreases significantly after CC treatment. Surface roughness also reduces considerably due to the removal of oxide layer formed at the interface between LN2 and SS. This study demonstrates that CC can be used as an effective post processing step for improving mechanical properties and surface quality of AM parts.


In [4]:
# Read materials.txt into a list with one entry per line (line endings removed).
# Later cells use these entries both as dataset rows and as new tokenizer tokens.
with open('materials.txt', 'r') as file:
    word_list = file.read().splitlines()

In [7]:
def _material_rows(titles, abstracts):
    """Wrap parallel title/abstract lists in a Dataset using the training columns.

    The 'doi' column is filled with the literal 'material', which the prompt
    builder uses to recognise these rows; 'publicationDate' is left empty.
    """
    count = len(word_list)
    return Dataset.from_dict({
        'title': titles,
        'abstract': abstracts,
        'doi': ['material'] * count,
        'publicationDate': [None] * count,
    })


# Each entry of word_list is paired with the same string with underscores
# removed, in both directions (original -> stripped, then stripped -> original).
stripped_words = [s.replace('_', '') for s in word_list]

# NOTE(review): this reassigns data_train from itself, so re-running the cell
# appends the material rows again.
data_train = concatenate_datasets([
    data_train,
    _material_rows(word_list, stripped_words),
    _material_rows(stripped_words, word_list),
])

In [8]:
# Use the first 200 rows of the training set as the validation set.
# NOTE(review): these rows remain in data_train, so the validation loss is
# measured on examples the model also trains on; this also replaces the
# data_val loaded from data_eval.json earlier. Confirm this is intended.
data_val = data_train.select(range(200))

In [9]:
def generate_prompt(type, title, abstract=None, eos_token="</s>"):
    """Build the text prompt for one dataset row.

    Args:
        type: Row kind. The literal 'material' selects the material template;
            any other value selects the paper-abstract template.
        title: Text placed after the instruction line.
        abstract: Target text. When falsy, nothing (and no EOS marker) is
            appended after the cue.
        eos_token: Marker appended after the target text when one is given.

    Returns:
        The prompt string; segments are separated by single spaces and the
        string always ends with a trailing space.
    """
    completion = abstract + ' ' + eos_token if abstract else ''
    if type == 'material':
        return f"The material :\n {title} is {completion} "
    return f"The abstract of the paper:\n {title}\n Abstract: {completion} "

# Preview the prompt for the last training row; its 'doi' value is passed as the row type.
print(generate_prompt(data_train[-1]["doi"], data_train[-1]["title"], data_train[-1]["abstract"]))

The material :
 NiFeAlO4 is NiFeAlO_4 </s> 


In [10]:
# LoRA adapter settings for causal-LM fine-tuning: rank 128 with alpha 256,
# applied to the four attention projection modules, without bias terms.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=128,
    lora_alpha=256,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

In [11]:
# Step 1: Add new tokens to the tokenizer
# Every entry of word_list is registered as a token; add_tokens returns how
# many were actually added.
num_added_toks = tokenizer.add_tokens(word_list)
print(f"Added {num_added_toks} tokens")

# Register a dedicated padding token, then resize the embedding matrix to the
# new tokenizer length.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

# Wrap the resized model with the LoRA adapters defined in lora_config.
model = get_peft_model(model, lora_config)

Added 5552 tokens


In [12]:
# Step 2: Make the input embeddings trainable.
# The explicit "freeze everything" loop below is left disabled.
#for param in model.parameters():
#    param.requires_grad = False

embeddings = model.get_input_embeddings()

# Enable gradient updates for the entire input embedding matrix (all rows,
# not only the rows of the newly added tokens).
# NOTE(review): only the input embeddings are unfrozen here. If the output
# head (lm_head) is not tied to them, its rows for the new tokens stay
# frozen; confirm this is intended.
embeddings.weight.requires_grad = True

In [13]:
# Training configuration.
training_args = TrainingArguments(
    output_dir='results',            # checkpoints are written here

    # Schedule
    num_train_epochs=2,
    warmup_steps=300,
    learning_rate=1e-4,

    # Batching: 4 sequences per device x 4 accumulation steps per optimizer step
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,

    # Optimizer and precision
    optim="adamw_torch",
    bf16=True,
    max_grad_norm=0.2,

    # Checkpointing: save every 300 steps, keep at most 3 checkpoints
    save_strategy="steps",
    save_steps=300,
    save_total_limit=3,

    # Evaluation runs once per epoch
    evaluation_strategy="epoch",
)

In [14]:
# Enable gradient checkpointing directly on the model.
# enable_input_require_grads() is called first; presumably this is needed so
# gradients can flow through the checkpointed blocks while the base weights
# are frozen. Verify against the transformers documentation.
model.enable_input_require_grads()
model.gradient_checkpointing_enable()

In [None]:
def formatting_func(prompt):
  """Turn a batch of dataset columns into a list of prompt strings.

  Args:
    prompt: Mapping with parallel "doi", "title" and "abstract" sequences.

  Returns:
    One prompt per row, built by generate_prompt with the row's "doi" value
    passed as the row type.
  """
  rows = zip(prompt["doi"], prompt["title"], prompt["abstract"])
  return [generate_prompt(kind, title, abstract) for kind, title, abstract in rows]


# Build the supervised fine-tuning trainer. Each batch of rows is turned into
# prompt strings by formatting_func; sequences are limited to 512 tokens.
trainer = SFTTrainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_val,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args
)

# Upcast every module whose name contains "norm" to float32 for more stable
# training. Module.to() converts the module in place, so the result does not
# need to be re-bound.
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module.to(torch.float32)

trainer.train()

# Save the final model next to the checkpoints. The original code referenced
# an undefined `output_dir` variable here, which raised NameError only after
# training had finished; the directory is taken from training_args instead.
trainer.save_model(f"{training_args.output_dir}/final")

# Step Training Loss Validation Loss
# 10 1.848200 1.746341
# 20 1.688300 1.696681
# 30 1.654500 1.698127
# 40 1.579400 1.652010
# 50 1.492600 1.701877

Map:   0%|          | 0/198383 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss


In [3]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    """Build the inference prompt asking for a magnetic material with a property.

    Args:
        prompt: Property description inserted after "Magnetic materials with".
        output: Accepted but not used.
        eos_token: Accepted but not used.

    Returns:
        The prompt string, ending at the "molecular formula" cue so the next
        token predicted by the model is the answer.
    """
    return (
        "Answer the materials:\n"
        f" Magnetic materials with {prompt}\n"
        " The molecular formula of the material:"
    )

# Build the query prompt and move its token ids to the GPU.
input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

# Forward pass only; no gradients are needed for inspection.
with torch.no_grad():
    logits = model(input_tokens).logits

# Next-token distribution: softmax over the vocabulary at the last position.
probabilities = logits[:, -1, :].softmax(dim=-1)

# Keep the top_k most probable next tokens and their probabilities.
top_k = 100
top_probabilities, top_token_ids = probabilities.topk(top_k)

# Remove the size-1 batch dimension and convert to plain Python lists.
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Pair each decoded token with its probability.
top_words_with_probs = []
for token_id, prob in zip(top_token_ids, top_probabilities):
    top_words_with_probs.append((tokenizer.decode([token_id]), prob))

# Display the candidates.
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with low magnetic damping constant.
 The molecular formula of the material:
Word: BaCo_xNi_2-xFe_16O_27, Probability: 0.0457
Word: K_2NaNiF_6, Probability: 0.0281
Word: C, Probability: 0.0275
Word: CoZnTiO_4, Probability: 0.0215
Word: Mg_10Fe, Probability: 0.0193
Word: 
, Probability: 0.0189
Word: Ca_2Fe_2-xSc_xO_5, Probability: 0.0139
Word: Ca_3Fe_15O_25, Probability: 0.0133
Word: Fe, Probability: 0.0124
Word: Ag_2FeSn_3S_8, Probability: 0.0115
Word: Fe_20_4, Probability: 0.0104
Word: Ni_0.9Zn_0.1CoO_2, Probability: 0.0092
Word: V_2O_3, Probability: 0.0080
Word: La_0.7Ca_0.2-xSr_xK_0.1MnO_3, Probability: 0.0078
Word: (, Probability: 0.0063
Word: Ni_0.91Pd_0.09, Probability: 0.0061
Word: Ta_0.1CoSb, Probability: 0.0054
Word: La_0.7Sr_0.3Mn_0.9Fe_0.05Mg_0.05O_3, Probability: 0.0053
Word: Cu_0.5CrCo_1.5S_4, Probability: 0.0052
Word: M, Probability: 0.0052
Word: Sr_0.08MnO_3, Probability: 0.0049
Word: Cu, Probability: 0.0049
Word: Co_13, Probabili

In [4]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    """Build the inference prompt asking for a magnetic material with a property.

    Args:
        prompt: Property description inserted after "Magnetic materials with".
        output: Accepted but not used.
        eos_token: Accepted but not used.

    Returns:
        The prompt string, ending at the "molecular formula" cue so the next
        token predicted by the model is the answer.
    """
    return (
        "Answer the materials:\n"
        f" Magnetic materials with {prompt}\n"
        " The molecular formula of the material:"
    )

# Build the query prompt and move its token ids to the GPU.
input_prompt = generate_prompt('low magnetocrystalline anisotropy.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

# Forward pass only; no gradients are needed for inspection.
with torch.no_grad():
    logits = model(input_tokens).logits

# Next-token distribution: softmax over the vocabulary at the last position.
probabilities = logits[:, -1, :].softmax(dim=-1)

# Keep the top_k most probable next tokens and their probabilities.
top_k = 100
top_probabilities, top_token_ids = probabilities.topk(top_k)

# Remove the size-1 batch dimension and convert to plain Python lists.
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Pair each decoded token with its probability.
top_words_with_probs = []
for token_id, prob in zip(top_token_ids, top_probabilities):
    top_words_with_probs.append((tokenizer.decode([token_id]), prob))

# Display the candidates.
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with low magnetocrystalline anisotropy.
 The molecular formula of the material:
Word: BaCo_xNi_2-xFe_16O_27, Probability: 0.0453
Word: Ca_2Fe_2-xSc_xO_5, Probability: 0.0321
Word: K_2NaNiF_6, Probability: 0.0290
Word: 
, Probability: 0.0270
Word: C, Probability: 0.0196
Word: Ca_3Fe_15O_25, Probability: 0.0171
Word: CoZnTiO_4, Probability: 0.0152
Word: Fe, Probability: 0.0123
Word: Mg_10Fe, Probability: 0.0111
Word: Fe_20_4, Probability: 0.0103
Word: Ag_2FeSn_3S_8, Probability: 0.0092
Word: Ta_0.1CoSb, Probability: 0.0080
Word: Sr_0.08MnO_3, Probability: 0.0074
Word: V_2O_3, Probability: 0.0073
Word: (, Probability: 0.0068
Word: Ni_0.9Zn_0.1CoO_2, Probability: 0.0068
Word: Co_13, Probability: 0.0064
Word: Ni_0.7Cu_0.1Zn_0.2La, Probability: 0.0062
Word: M, Probability: 0.0058
Word: Cu, Probability: 0.0054
Word: Ni_0.91Pd_0.09, Probability: 0.0053
Word: LnSrFeO_4, Probability: 0.0053
Word: Cu_0.5CrCo_1.5S_4, Probability: 0.0053
Word: Zn_0.25Co_0.7

In [5]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    """Build the inference prompt asking for a magnetic material with a property.

    Args:
        prompt: Property description inserted after "Magnetic materials with".
        output: Accepted but not used.
        eos_token: Accepted but not used.

    Returns:
        The prompt string, ending at the "molecular formula" cue so the next
        token predicted by the model is the answer.
    """
    return (
        "Answer the materials:\n"
        f" Magnetic materials with {prompt}\n"
        " The molecular formula of the material:"
    )

# Build the query prompt and move its token ids to the GPU.
input_prompt = generate_prompt('low density of states at the Fermi level.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

# Forward pass only; no gradients are needed for inspection.
with torch.no_grad():
    logits = model(input_tokens).logits

# Next-token distribution: softmax over the vocabulary at the last position.
probabilities = logits[:, -1, :].softmax(dim=-1)

# Keep the top_k most probable next tokens and their probabilities.
top_k = 100
top_probabilities, top_token_ids = probabilities.topk(top_k)

# Remove the size-1 batch dimension and convert to plain Python lists.
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Pair each decoded token with its probability.
top_words_with_probs = []
for token_id, prob in zip(top_token_ids, top_probabilities):
    top_words_with_probs.append((tokenizer.decode([token_id]), prob))

# Display the candidates.
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with low density of states at the Fermi level.
 The molecular formula of the material:
Word: BaCo_xNi_2-xFe_16O_27, Probability: 0.0439
Word: Ca_2Fe_2-xSc_xO_5, Probability: 0.0306
Word: 
, Probability: 0.0266
Word: C, Probability: 0.0232
Word: Ca_3Fe_15O_25, Probability: 0.0150
Word: K_2NaNiF_6, Probability: 0.0141
Word: Mg_10Fe, Probability: 0.0118
Word: Fe_20_4, Probability: 0.0092
Word: Sr_0.08MnO_3, Probability: 0.0090
Word: Zn_0.25Co_0.75Fe_2O_4, Probability: 0.0086
Word: LnSrFeO_4, Probability: 0.0085
Word: CoZnTiO_4, Probability: 0.0080
Word: Ta_0.1CoSb, Probability: 0.0076
Word: La_0.7Ca_0.2-xSr_xK_0.1MnO_3, Probability: 0.0071
Word: MgV_2O_4, Probability: 0.0065
Word: Ni_0.9Zn_0.1CoO_2, Probability: 0.0062
Word: Ag_2FeSn_3S_8, Probability: 0.0056
Word: Zn_2VO_4, Probability: 0.0055
Word: (, Probability: 0.0054
Word: Ni_0.91Pd_0.09, Probability: 0.0051
Word: Co_0.2Zn_0.8Fe_2O_4, Probability: 0.0049
Word: Ba_2CoTaO_6, Probability: 0.004

In [6]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    """Build the inference prompt asking for a magnetic material with a property.

    Args:
        prompt: Property description inserted after "Magnetic materials with".
        output: Accepted but not used.
        eos_token: Accepted but not used.

    Returns:
        The prompt string, ending at the "molecular formula" cue so the next
        token predicted by the model is the answer.
    """
    return (
        "Answer the materials:\n"
        f" Magnetic materials with {prompt}\n"
        " The molecular formula of the material:"
    )

# Build the query prompt and move its token ids to the GPU.
input_prompt = generate_prompt('high conductivity.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

# Forward pass only; no gradients are needed for inspection.
with torch.no_grad():
    logits = model(input_tokens).logits

# Next-token distribution: softmax over the vocabulary at the last position.
probabilities = logits[:, -1, :].softmax(dim=-1)

# Keep the top_k most probable next tokens and their probabilities.
top_k = 100
top_probabilities, top_token_ids = probabilities.topk(top_k)

# Remove the size-1 batch dimension and convert to plain Python lists.
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Pair each decoded token with its probability.
top_words_with_probs = []
for token_id, prob in zip(top_token_ids, top_probabilities):
    top_words_with_probs.append((tokenizer.decode([token_id]), prob))

# Display the candidates.
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with high conductivity.
 The molecular formula of the material:
Word: C, Probability: 0.0473
Word: 
, Probability: 0.0428
Word: BaCo_xNi_2-xFe_16O_27, Probability: 0.0283
Word: CoZnTiO_4, Probability: 0.0204
Word: Mg_10Fe, Probability: 0.0177
Word: Ca_2Fe_2-xSc_xO_5, Probability: 0.0160
Word: Ca_3Fe_15O_25, Probability: 0.0120
Word: Fe, Probability: 0.0118
Word: Ta_0.1CoSb, Probability: 0.0117
Word: K_2NaNiF_6, Probability: 0.0113
Word: V_2O_3, Probability: 0.0102
Word: (, Probability: 0.0091
Word: Cu, Probability: 0.0087
Word: Sr_0.08MnO_3, Probability: 0.0076
Word: Ag_2FeSn_3S_8, Probability: 0.0073
Word: Ni_0.9Zn_0.1CoO_2, Probability: 0.0064
Word: Co_0.0014Ni_0.65Fe_2.34O_4..01, Probability: 0.0062
Word: M, Probability: 0.0061
Word: Pb_2FeWO_6, Probability: 0.0051
Word: CH, Probability: 0.0051
Word: BaFe_12, Probability: 0.0051
Word: , Probability: 0.0050
Word: Fe_20_4, Probability: 0.0049
Word: P, Probability: 0.0047
Word: H, Probability: 

In [7]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    """Build the inference prompt asking for a magnetic material with a property.

    Args:
        prompt: Property description inserted after "Magnetic materials with".
        output: Accepted but not used.
        eos_token: Accepted but not used.

    Returns:
        The prompt string, ending at the "molecular formula" cue so the next
        token predicted by the model is the answer.
    """
    return (
        "Answer the materials:\n"
        f" Magnetic materials with {prompt}\n"
        " The molecular formula of the material:"
    )

# Build the query prompt and move its token ids to the GPU.
input_prompt = generate_prompt('low magnetic damping constant in low temperture.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

# Forward pass only; no gradients are needed for inspection.
with torch.no_grad():
    logits = model(input_tokens).logits

# Next-token distribution: softmax over the vocabulary at the last position.
probabilities = logits[:, -1, :].softmax(dim=-1)

# Keep the top_k most probable next tokens and their probabilities.
top_k = 100
top_probabilities, top_token_ids = probabilities.topk(top_k)

# Remove the size-1 batch dimension and convert to plain Python lists.
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Pair each decoded token with its probability.
top_words_with_probs = []
for token_id, prob in zip(top_token_ids, top_probabilities):
    top_words_with_probs.append((tokenizer.decode([token_id]), prob))

# Display the candidates.
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

Answer the materials:
 Magnetic materials with low magnetic damping constant in low temperture.
 The molecular formula of the material:
Word: BaCo_xNi_2-xFe_16O_27, Probability: 0.0310
Word: CoZnTiO_4, Probability: 0.0289
Word: K_2NaNiF_6, Probability: 0.0280
Word: Mg_10Fe, Probability: 0.0185
Word: C, Probability: 0.0166
Word: Ni_0.9Zn_0.1CoO_2, Probability: 0.0137
Word: Fe, Probability: 0.0132
Word: V_2O_3, Probability: 0.0127
Word: Ca_3Fe_15O_25, Probability: 0.0120
Word: La_0.7Ca_0.2-xSr_xK_0.1MnO_3, Probability: 0.0107
Word: Ca_2Fe_2-xSc_xO_5, Probability: 0.0100
Word: 
, Probability: 0.0094
Word: La_0.7Sr_0.3Mn_0.9Fe_0.05Mg_0.05O_3, Probability: 0.0089
Word: Co_13, Probability: 0.0084
Word: Fe_20_4, Probability: 0.0082
Word: Ag_2FeSn_3S_8, Probability: 0.0073
Word: LiCoV_3O_12, Probability: 0.0064
Word: (, Probability: 0.0061
Word: MgV_2O_4, Probability: 0.0060
Word: Fe_2MnSi, Probability: 0.0058
Word: Cu_0.5CrCo_1.5S_4, Probability: 0.0057
Word: Ni_0.91Pd_0.09, Probability: 0.00

In [2]:
# Reload the base model in bfloat16 and attach the adapter saved at training
# checkpoint 8400. The tokenizer stored with the checkpoint sets the size the
# embedding matrix is resized to before the adapter is loaded.
model_name = "meta-llama/Llama-2-7b-hf"
checkpoint_dir = 'results/checkpoint-8400'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model.resize_token_embeddings(len(tokenizer))

model.load_adapter(checkpoint_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model in bfloat16 and snapshot its input embeddings together
# with the base tokenizer, before anything is resized.
model_name = "meta-llama/Llama-2-7b-hf"
checkpoint_dir = 'results/checkpoint-8400'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
original_tokenizer = AutoTokenizer.from_pretrained(model_name)
original_embeddings = model.get_input_embeddings().weight.detach().clone()

# Switch to the checkpoint tokenizer, resize the embeddings to match it, and
# attach the adapter saved at the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model.resize_token_embeddings(len(tokenizer))
model.load_adapter(checkpoint_dir)

# Copy the snapshot back over the first len(original_tokenizer) rows, so the
# base-vocabulary rows hold their pretrained values again; rows beyond that
# range are left as they are.
embeddings = model.get_input_embeddings().weight.data
base_vocab_size = len(original_tokenizer)
embeddings[:base_vocab_size] = original_embeddings[:base_vocab_size]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def generate_prompt(prompt, output=None, eos_token="</s>"):
    """Build the inference prompt asking for a magnetic material with a property.

    Args:
        prompt: Property description inserted after "Magnetic materials with".
        output: Accepted but not used.
        eos_token: Accepted but not used.

    Returns:
        The prompt string, ending at the "molecular formula" cue so the next
        token predicted by the model is the answer.
    """
    return (
        "Answer the materials:\n"
        f" Magnetic materials with {prompt}\n"
        " The molecular formula of the material:"
    )

# Build the query prompt and move its token ids to the GPU.
input_prompt = generate_prompt('low magnetic damping constant.')
print(input_prompt)
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")

# Forward pass only; no gradients are needed for inspection.
with torch.no_grad():
    logits = model(input_tokens).logits

# For numerical stability, subtract the per-position maximum from the logits
# before the softmax (this shift does not change the softmax result).
logits_stable = logits - logits.max(dim=-1, keepdim=True).values

# Next-token distribution: softmax over the vocabulary at the last position.
probabilities = logits_stable[:, -1, :].softmax(dim=-1)

# Keep the 10 most probable next tokens and their probabilities.
top_k = 10
top_probabilities, top_token_ids = probabilities.topk(top_k)

# Remove the size-1 batch dimension and convert to plain Python lists.
top_probabilities = top_probabilities.squeeze().tolist()
top_token_ids = top_token_ids.squeeze().tolist()

# Pair each decoded token with its probability.
top_words_with_probs = []
for token_id, prob in zip(top_token_ids, top_probabilities):
    top_words_with_probs.append((tokenizer.decode([token_id]), prob))

# Display the candidates.
for word, prob in top_words_with_probs:
    print(f"Word: {word}, Probability: {prob:.4f}")

In [9]:
# Number of entries in the next-token distribution for the first (only) prompt.
len(probabilities[0])

37553

In [23]:
# Inspect the token ids of the encoded prompt.
input_tokens

tensor([[    1,   673,   278, 17279, 29901,    13,  3561,  1212,   293, 17279,
           411,  4482, 15611,   270,  1160,   292,  4868, 29889,    13,   450,
         13206, 16637,  7063,   310,   278,  5518, 29901]], device='cuda:0')

In [21]:
# Inspect the current value of input_tokens.
input_tokens

tensor([[    1,   673,   278, 17279, 29901,    13,  3561,  1212,   293, 17279,
           411,  4482, 15611,   270,  1160,   292,  4868, 29889,    13,   450,
         13206, 16637,  7063,   310,   278,  5518, 29901, 37551]],
       device='cuda:0')

In [67]:
# Save the model to 'result', requesting that the embedding layers be saved too.
# NOTE(review): this writes to 'result', while training checkpoints use 'results'; confirm the directory name.
model.save_pretrained('result', save_embedding_layers=True)

In [68]:
# Save the tokenizer files to the same 'result' directory as the model.
tokenizer.save_pretrained('result')

('result/tokenizer_config.json',
 'result/special_tokens_map.json',
 'result/tokenizer.model',
 'result/added_tokens.json',
 'result/tokenizer.json')

In [34]:
# Save the tokenizer files to the 'results' directory (the training output directory).
tokenizer.save_pretrained('results')

('results/tokenizer_config.json',
 'results/special_tokens_map.json',
 'results/tokenizer.model',
 'results/added_tokens.json',
 'results/tokenizer.json')

In [None]:
# Print the dtype of every model parameter.
for param in model.parameters():
    print(param.dtype)

In [17]:
# Display the module tree of the model.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(38111, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=128, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=128, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=40

In [17]:
# For every parameter, print whether it is trainable, its name, and its dtype.
for name, param in model.named_parameters():
    print(param.requires_grad, name, param.dtype)

True base_model.model.model.embed_tokens.weight torch.float16
False base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.float16
False base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight torch.float16
False base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight torch.float16
False base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.o_p

In [14]:
# Verify which parameters are trainable: list only those with requires_grad set.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Trainable: {name}")

Trainable: base_model.model.model.embed_tokens.weight
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.1.s

In [None]:
# List every parameter with its requires_grad flag. The "Trainable:" label is
# printed for all parameters; the flag that follows is the actual status.
for name, param in model.named_parameters():
    print(f"Trainable: {name}", param.requires_grad)

In [14]:
# parameters() returns a generator, so this displays only the generator object itself.
model.parameters()

<generator object Module.parameters at 0x7faf7c779ee0>

In [15]:
# Inspect the input embedding layer and its dimensions.
model.get_input_embeddings()

Embedding(38544, 4096)

In [18]:
# Total parameter count reported by the model.
model.num_parameters()

6922694656

In [19]:
# Register "<PAD>" as the padding token, then resize the embedding matrix to
# the new tokenizer length.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(38545, 4096)