In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

In [2]:
!huggingface-cli login --token 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

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
model_name = 'meta-llama/Meta-Llama-3-8B'
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             #load_in_8bit=True,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto"
                                            )
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# model_name = "meta-llama/Meta-Llama-3-8B"
# model = AutoModelForCausalLM.from_pretrained(model_name,
#                                              torch_dtype=torch.bfloat16,
#                                              device_map="auto"
#                                             )
# tokenizer = AutoTokenizer.from_pretrained('ZWG817/Llama3_Chat_Materials')
# model.resize_token_embeddings(len(tokenizer))

# model.load_adapter('ZWG817/Llama3_Chat_Materials')

In [5]:
data = load_dataset("ZWG817/Abstract_Template")
data_train = data["train"]
print(data_train)

#custom_data = load_dataset('json', data_files='data_eval.json')
#data_val = custom_data['train']

with open('materials.txt', 'r') as file:
    word_list = file.read().splitlines()

Downloading data:   0%|          | 0.00/12.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13 [00:00<?, ? examples/s]

Dataset({
    features: ['publicationDate', 'title', 'abstract', 'id'],
    num_rows: 13
})


In [6]:
# data = load_dataset('csv', data_files="gdc.csv")
# data_train = concatenate_datasets([data_train, data["train"]])

In [7]:
new_data = {
    'title': word_list,
    'abstract': [s.replace('_', '') for s in word_list],
    'id': ['material'] * len(word_list),  # 假设新数据集中没有id信息
    'publicationDate': [None] * len(word_list)  # 假设新数据集中没有publicationDate信息
}
new_dataset = Dataset.from_dict(new_data)

data_train = concatenate_datasets([data_train, new_dataset])

new_data = {
    'title': [s.replace('_', '') for s in word_list],
    'abstract': word_list,
    'id': ['material'] * len(word_list),  # 假设新数据集中没有id信息
    'publicationDate': [None] * len(word_list)  # 假设新数据集中没有publicationDate信息
}
new_dataset = Dataset.from_dict(new_data)

data_train = concatenate_datasets([data_train, new_dataset])

In [8]:
data_val = data_train.select(range(200))

In [9]:
def generate_prompt(type_, prompt, output=None, eos_token="</s>"):
    begin = "<s>[INST]"
    if type_ == 'material':
        instruction = "<<SYS>> As a helpful scientific assistant versed in the composition of various materials, identify and elaborate on the components that constitute the following material.<</SYS>>\n"
        prompt = f"{prompt} is [/INST]\n"
        output = f"{output + ' ' + eos_token if output else ''} "
    elif type_ == 'gilbert':
        instruction = "<<SYS>> Tasked as a helpful scientific assistant, provide a concise numerical value in response to the subsequent inquiry. Refrain from including any supplementary information or context.<</SYS>>\n"
        prompt = f"The Gilbert damping constant of {prompt}[/INST]\n"
        output = f"{str(output) + ' ' + eos_token if output else ''} "
    elif type_ == 'summary':
        instruction = "<<SYS>> Functioning as a helpful scientific assistant, distill the content of the ensuing paper into a succinct summary that captures the essential findings and conclusions.<</SYS>>\n"
        prompt = f"The Gilbert damping constant of {prompt}[/INST]\n"
        output = f"{str(output) + ' ' + eos_token if output else ''} "
    else:
        instruction = "<<SYS>> In your role as a helpful scientific assistant, convey the abstract of the forthcoming paper, presenting the key objectives, methodology, results, and implications in a clear and concise manner.<</SYS>>\n"
        prompt = f"{prompt} [/INST]\n"
        output = f"Abstract: {output + ' ' + eos_token if output else ''} "
    #end = "[/INST]\n"
    prompt = (" ").join([str(begin), str(instruction), str(input), str(output)])
    return prompt

print(generate_prompt(data_train[1]["id"], data_train[1]["title"], data_train[1]["abstract"]))

<s>[INST] <<SYS>> In your role as a helpful scientific assistant, convey the abstract of the forthcoming paper, presenting the key objectives, methodology, results, and implications in a clear and concise manner.<</SYS>>
 <bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7f68572d61a0>> Abstract: Magnetization dynamics in W/CoFeB, CoFeB/Pt and W/CoFeB/Pt multilayers was
investigated using spin-orbit-torque ferromagnetic resonance (SOT-FMR)
technique. An analytical model based on magnetization dynamics due to SOT was
used to fit heavy metal (HM) thickness dependence of symmetric and
antisymmetric components of the SOT-FMR signal. The analysis resulted in a
determination of the properties of HM layers, such as spin Hall angle and spin
diffusion length. The spin Hall angle of -0.36 and 0.09 has been found in the
W/CoFeB and CoFeB/Pt bilayers, respectively, which add up in the case of
W/CoFeB/Pt trilayer. More importantly, we have determined effective interfaci

In [10]:
input_prompt = generate_prompt(data_train[-1]["id"], data_train[-1]["title"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=128,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<s>[INST] <<SYS>> As a helpful scientific assistant versed in the composition of various materials, identify and elaborate on the components that constitute the following material.<</SYS>>
 <bound method Kernel.raw_input of <ipykernel.ipkernel.IPythonKernel object at 0x7f68572d61a0>>  


In [11]:
lora_config = LoraConfig(
        r=128,
        lora_alpha=256,
        lora_dropout=0.05,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

In [12]:
# Step 1: Add new tokens to the tokenizer
num_added_toks = tokenizer.add_tokens(word_list)
print(f"Added {num_added_toks} tokens")

tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

#model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

Added 6676 tokens


In [13]:
# Step 2: Freeze all parameters in the model
#for param in model.parameters():
#    param.requires_grad = False

embeddings = model.get_input_embeddings()

# Enable gradient updates for the entire embedding layer
# Assuming you might want to fine-tune all embeddings, but here's how to selectively unfreeze
embeddings.weight.requires_grad = True

In [14]:
# 准备训练参数
training_args = TrainingArguments(
    output_dir='results',            # 输出目录
    num_train_epochs=2,              # 总训练轮数
    per_device_train_batch_size=4,   # 训练的batch size
    per_device_eval_batch_size=4,    # 验证的batch size
    gradient_accumulation_steps=4, 
    #gradient_checkpointing=True,
    #optim = "paged_adamw_32bit",
    optim = "adamw_torch",
    bf16=True,
    #fp16=True,
    warmup_steps=300,                # 预热步数
    learning_rate = 1e-4,
    max_grad_norm = 0.2,
    #max_steps = 50,
    #warmup_ratio = 0.03,
    #weight_decay=0.01,               # 权重衰减
    save_strategy="steps",           # 设置保存策略为"steps"
    save_steps=300,                  # 每500步保存一次模型
    save_total_limit=3,              # 最多保存3个检查点
    evaluation_strategy="epoch",     # 设置评估策略为"steps"
    group_by_length=True,
    #eval_steps=10000
)

In [15]:
# gradient checkpointing enabling
model.enable_input_require_grads()
model.gradient_checkpointing_enable()

In [16]:
def formatting_func(prompt):
  output = []

  for a, d, s in zip(prompt["id"], prompt["title"], prompt["abstract"]):
    op = generate_prompt(a, d, s)
    output.append(op)

  return output


trainer = SFTTrainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_val,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args
)

# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
#for name, module in trainer.model.named_modules():
#    if "norm" in name:
#        module = module.to(torch.float32)

trainer.train()
# trainer.save_model(f"{output_dir}/final")

# Step Training Loss Validation Loss
# 10 1.848200 1.746341
# 20 1.688300 1.696681
# 30 1.654500 1.698127
# 40 1.579400 1.652010
# 50 1.492600 1.701877



Map:   0%|          | 0/13365 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss
0,0.5064,0.459688




SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

In [None]:
trainer.save_model(f"/home/final")

In [None]:
model.push_to_hub("ZWG817/Llama3_Chat_Materials")
tokenizer.push_to_hub("ZWG817/Llama3_Chat_Materials")

In [None]:
# data = load_dataset('json', data_files='selected_paragraphs.json')
# data = data['train']

In [None]:
def generate_prompt(content):
    begin = "<s>[INST]"
    #syst = "<<SYS>> You are a helpful assistant, always answer as helpfully as possible.\n If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.<</SYS>>\n"
    #inst = "Read the following text. Does it mention the Gilbert damping constant of a certain material? If so, list the corresponding material and its Gilbert damping canstant.\n" + content
    syst = "<<SYS>> Acting as a helpful assistant with a focus on efficiency, review the ensuing text to determine if it references the Gilbert damping constant for a specific material. Should such information be present, list only the material involved alongside its respective Gilbert damping constant.<</SYS>>\n"
    inst = content
    end = "[/INST]\n"
    prompt = (" ").join([str(begin), str(syst), str(inst), str(end)])
    return prompt

print(generate_prompt(str(data['train'][0]['abstract'])))

In [None]:
for i in data['train']:
    try:
        input_prompt = generate_prompt(str(i['abstract']))
        input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
        with torch.cuda.amp.autocast():
            generation_output = model.generate(
                input_ids=input_tokens,
                max_new_tokens=128,
                do_sample=True,
                top_k=5,
                top_p=0.9,
                temperature=0.2,
                repetition_penalty=1.1,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
            )
        op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
        #print(op)
        
        inst_index = op.find('[/INST]')
        
        if inst_index != -1:
            print(op[inst_index + len('[/INST]'):])
        else:
            print("未找到'[/INST]'标记")
    except:
        pass

In [None]:
input_tokens

In [None]:
model.save_pretrained('result', save_embedding_layers=True)

In [None]:
for param in model.parameters():
    print(param.dtype)

In [None]:
model

In [None]:
# for name, param in model.named_parameters():
#     print(param.requires_grad, name, param.dtype)

In [None]:
# Verify which parameters are trainable
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"Trainable: {name}")

In [None]:
# for name, param in model.named_parameters():
#     print(f"Trainable: {name}", param.requires_grad)

In [None]:
# model.parameters()

In [None]:
# model.get_input_embeddings()

In [None]:
# model.num_parameters()

In [None]:
# tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# model.resize_token_embeddings(len(tokenizer))

In [None]:
model.logic