In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset, Dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

In [3]:
# Base chat checkpoint used throughout the notebook.
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Tokenizer and weights both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #load_in_8bit=True,
    torch_dtype=torch.float16,   # half-precision weights
    device_map="auto",           # let the library decide device placement
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Alternative loader: base weights in bfloat16 plus the tokenizer and adapter
# stored under results/checkpoint-20100.
# NOTE(review): this cell and the float16 loader cell both bind `model` and
# `tokenizer`; whichever ran last determines what later cells see.
model_name = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto"
                                            )
# The tokenizer is read from the checkpoint directory, not from the base model.
tokenizer = AutoTokenizer.from_pretrained('results/checkpoint-20100')
# Match the embedding matrix to the checkpoint tokenizer's length *before*
# attaching the adapter.
# NOTE(review): presumably the checkpoint tokenizer carries the extra material
# tokens and <PAD> added during training — confirm.
model.resize_token_embeddings(len(tokenizer))

model.load_adapter('results/checkpoint-20100')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Load the "enyuan/Abstracts" dataset and keep its training split.
data = load_dataset("enyuan/Abstracts")
data_train = data["train"]
print(data_train)

#custom_data = load_dataset('json', data_files='data_eval.json')
#data_val = custom_data['train']

# materials.txt holds one entry per line; the lines become `word_list`.
with open('materials.txt', 'r') as materials_file:
    materials_text = materials_file.read()
word_list = materials_text.splitlines()

Dataset({
    features: ['title', 'abstract', 'doi', 'publicationDate'],
    num_rows: 165071
})


In [None]:
# Load gdc.csv and append its "train" split to the training set.
# NOTE(review): this rebinds `data` and grows `data_train` from its current
# value, so running the cell a second time appends the CSV rows again.
# NOTE(review): presumably gdc.csv has the same columns as data_train
# (title / abstract / doi / publicationDate) — confirm before concatenating.
data = load_dataset('csv', data_files="gdc.csv")
data_train = concatenate_datasets([data_train, data["train"]])

In [6]:
# Synthetic "material" rows: each entry of word_list is paired with the same
# string with underscores removed, once as (title -> abstract) and once the
# other way round. The 'doi' column is reused as a row-type tag —
# generate_prompt() selects its template from the value 'material'.
stripped_words = [s.replace('_', '') for s in word_list]

# Make the cell safe to re-run: drop material rows appended by an earlier
# execution so they are not stacked onto data_train a second time.
data_train = data_train.filter(lambda row: row['doi'] != 'material')

for titles, abstracts in ((word_list, stripped_words), (stripped_words, word_list)):
    new_data = {
        'title': titles,
        'abstract': abstracts,
        'doi': ['material'] * len(word_list),          # row-type tag, not a real DOI
        'publicationDate': [None] * len(word_list),    # synthetic rows have no date
    }
    new_dataset = Dataset.from_dict(new_data)
    data_train = concatenate_datasets([data_train, new_dataset])

In [7]:
# Validation set: the first 200 rows of data_train.
# NOTE(review): these rows are not removed from data_train, which is also the
# train_dataset handed to the trainer, so the validation loss is computed on
# examples the model trains on — consider holding them out.
data_val = data_train.select(range(200))

In [8]:
def generate_prompt(type_, prompt, output=None, eos_token="</s>"):
    """Build a Llama-2 chat-style prompt string for one example.

    Args:
        type_: Row-type tag selecting the template: 'material', 'gilbert',
            'summary'; any other value uses the title -> abstract template.
        prompt: Text placed in the user turn.
        output: Target completion. When it is None or "" only the prompt part
            is produced (inference); otherwise the completion followed by
            ``eos_token`` is appended (training). Non-string values are
            converted with ``str``.
        eos_token: End-of-sequence marker appended after the completion.

    Returns:
        "<s>[INST]", the system instruction, the user turn and the completion
        segment joined by single spaces.
    """
    # Test for "no target" explicitly: a plain truthiness check would also
    # discard a legitimate numeric target of 0.
    if output is None or output == "":
        completion = ""
    else:
        completion = f"{output} {eos_token}"

    begin = "<s>[INST]"
    if type_ == 'material':
        instruction = "<<SYS>> You are a helpful scientific assistant, answer the composition of the following material.<</SYS>>\n"
        user_turn = f"{prompt} is [/INST]\n"
        answer = f"{completion} "
    elif type_ == 'gilbert':
        instruction = "<<SYS>> You are a helpful scientific assistant, answer the question below. Do not answer with anything other than a value.<</SYS>>\n"
        user_turn = f"The Gilbert damping constant of {prompt}[/INST]\n"
        answer = f"{completion} "
    elif type_ == 'summary':
        instruction = "<<SYS>> You are a helpful scientific assistant, answer the summary of the below paper.<</SYS>>\n"
        # NOTE(review): this user turn is identical to the 'gilbert' one and
        # looks copy-pasted; kept unchanged so existing prompts stay the same.
        user_turn = f"The Gilbert damping constant of {prompt}[/INST]\n"
        answer = f"{completion} "
    else:
        instruction = "<<SYS>> You are a helpful scientific assistant, answer the abstract of the below paper.<</SYS>>\n"
        user_turn = f"{prompt} [/INST]\n"
        answer = f"Abstract: {completion} "
    #end = "[/INST]\n"
    # The original joined the builtin `input` here instead of the user turn,
    # which makes str.join raise TypeError.
    return " ".join([begin, instruction, user_turn, answer])

# Sanity check: render the training prompt for row 1 — 'doi' selects the
# template, 'title' is the user text and 'abstract' is the target.
print(generate_prompt(data_train[1]["doi"], data_train[1]["title"], data_train[1]["abstract"]))

<s>[INST] <<SYS>> You are a helpful scientific assistant, answer the abstract of the below paper.<</SYS>>
 Experimental analysis on drilling of super duplex stainless steel 2507 (SDSS 2507) using cryogenic LCO_2 and MQL process [/INST]
 Abstract: Environmental-friendly liquid carbon dioxide (LCO_2) and biodegradable coconut oil–based minimum quantity lubrication (MQL) technique play a significant role in green machining compared to conventionally polluting cutting fluids. In this work, analysis of the drilling performance was made for super duplex stainless steel (SDSS) which finds use in numerous industrial applications in marine, petrochemical, and oil industries. Input parameters chosen were the cutting velocity of 60 m/min, feed rate of 0.03, 0.05, 0.07 mm/rev, and varying environmental conditions such as LCO_2, MQL, and flood coolant. Comparison between output parameters and analysis was made in all the environmental conditions based on cutting temperature (T), surface topography,

In [10]:
# Sample one completion for the last training row, giving the model only the
# prompt part (no target text).
input_prompt = generate_prompt(data_train[-1]["doi"], data_train[-1]["title"])
encoded_prompt = tokenizer(input_prompt, return_tensors="pt")
input_tokens = encoded_prompt["input_ids"].to("cuda")

# Sampling settings for this probe.
sampling_options = dict(
    max_new_tokens=128,
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.3,
    repetition_penalty=1.15,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
with torch.cuda.amp.autocast():
    generation_output = model.generate(input_ids=input_tokens, **sampling_options)

# Decode the first (only) returned sequence without special tokens.
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

[INST] <<SYS>> You are a helpful scientific assistant, answer the composition of the following material.<</SYS>>
 NiFeAlO4 is [/INST]
   Sure! The chemical formula for NiFeAlO4 is Nickel(II) iron(III) aluminum oxide.


In [11]:
# LoRA adapter settings, used both by get_peft_model() and by the SFTTrainer call.
lora_config = LoraConfig(
        r=128,                 # adapter rank
        lora_alpha=256,        # LoRA scaling numerator (alpha)
        lora_dropout=0.05,     # dropout applied inside the adapter
        target_modules=["q_proj","k_proj","v_proj","o_proj"],  # attention projections only
        bias="none",           # bias terms are not trained
        task_type="CAUSAL_LM",
    )

In [12]:
# Step 1: Add new tokens to the tokenizer
# Every line of materials.txt becomes a single new vocabulary token.
num_added_toks = tokenizer.add_tokens(word_list)
print(f"Added {num_added_toks} tokens")

# Register a dedicated padding token, then grow the embedding matrix so it
# covers every token id the tokenizer can now emit.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

#model = prepare_model_for_kbit_training(model)
# Wrap the resized model with the LoRA adapters.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this
# cell would wrap an already wrapped model — restart the kernel instead.
model = get_peft_model(model, lora_config)

Added 5552 tokens


In [13]:
# Step 2: make the input embeddings trainable.
# (The explicit "freeze everything" loop below is disabled.)
#for param in model.parameters():
#    param.requires_grad = False

embeddings = model.get_input_embeddings()

# Enable gradient updates for the whole input-embedding matrix; nothing here
# restricts training to the newly added token rows.
# NOTE(review): only the input embeddings are unfrozen in this cell; if the
# output head also needs to learn the new tokens, check whether it is
# trainable (e.g. LoraConfig `modules_to_save`) — confirm.
embeddings.weight.requires_grad = True

In [14]:
# Training hyper-parameters.
training_args = TrainingArguments(
    output_dir='results',            # output directory for checkpoints
    num_train_epochs=2,              # total number of training epochs
    per_device_train_batch_size=4,   # per-device training batch size
    per_device_eval_batch_size=4,    # per-device evaluation batch size
    gradient_accumulation_steps=4, 
    #gradient_checkpointing=True,
    #optim = "paged_adamw_32bit",
    optim = "adamw_torch",
    # NOTE(review): bf16 training is requested here while one of the loader
    # cells loads the model as torch.float16 — confirm the dtypes agree.
    bf16=True,
    #fp16=True,
    warmup_steps=300,                # learning-rate warm-up steps
    learning_rate = 1e-4,
    max_grad_norm = 0.2,
    #max_steps = 50,
    #warmup_ratio = 0.03,
    #weight_decay=0.01,               # weight decay (disabled)
    save_strategy="steps",           # checkpoint on a step schedule
    save_steps=300,                  # save a checkpoint every 300 steps
    save_total_limit=3,              # keep at most 3 checkpoints
    evaluation_strategy="epoch",     # evaluate once per epoch
    group_by_length=True,
    #eval_steps=10000
)

In [15]:
# Turn on gradient checkpointing directly on the model (the corresponding
# TrainingArguments flag is commented out).
# NOTE(review): enable_input_require_grads() is presumably needed so gradients
# reach the adapters through the checkpointed blocks when base weights are
# frozen — confirm.
model.enable_input_require_grads()
model.gradient_checkpointing_enable()

In [None]:
def formatting_func(prompt):
  """Render a column-oriented batch of rows into training prompt strings.

  The 'doi', 'title' and 'abstract' columns of `prompt` are walked in
  parallel and each triple is passed to generate_prompt() as
  (type tag, user text, target). Returns the list of rendered strings.
  """
  columns = zip(prompt["doi"], prompt["title"], prompt["abstract"])
  return [generate_prompt(tag, title, abstract) for tag, title, abstract in columns]


# Supervised fine-tuning: prompts are rendered by formatting_func and limited
# to 512 tokens.
# NOTE(review): `model` has already been wrapped by get_peft_model() in an
# earlier cell and the same lora_config is passed again here — confirm the
# trainer does not apply it a second time.
trainer = SFTTrainer(
    model=model,
    train_dataset=data_train,
    eval_dataset=data_val,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=512,
    tokenizer=tokenizer,
    args=training_args
)

# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
#for name, module in trainer.model.named_modules():
#    if "norm" in name:
#        module = module.to(torch.float32)

trainer.train()
# No variable named `output_dir` exists in the notebook, so the original line
# raised NameError once training had finished; reuse the directory configured
# in TrainingArguments instead.
trainer.save_model(f"{training_args.output_dir}/final")

# Step Training Loss Validation Loss
# 10 1.848200 1.746341
# 20 1.688300 1.696681
# 30 1.654500 1.698127
# 40 1.579400 1.652010
# 50 1.492600 1.701877

Map:   0%|          | 0/187279 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss


In [4]:
# Paragraphs to run extraction on: keep the 'train' split of the JSON file.
data = load_dataset('json', data_files='selected_paragraphs.json')['train']

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
def generate_prompt(content):
    """Wrap a text passage in the Gilbert-damping extraction prompt.

    Returns "<s>[INST]", the system instruction, `content` and "[/INST]\\n"
    joined by single spaces. Note that this definition replaces the earlier
    four-argument generate_prompt() once the cell has run.
    """
    #syst = "<<SYS>> You are a helpful assistant, always answer as helpfully as possible.\n If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.<</SYS>>\n"
    #inst = "Read the following text. Does it mention the Gilbert damping constant of a certain material? If so, list the corresponding material and its Gilbert damping canstant.\n" + content
    system_block = "<<SYS>> You are a helpful assistant. Read the following text, if it mention the Gilbert damping constant of a certain material, list the corresponding material and its Gilbert damping canstant. Make your answer as short as possible.<</SYS>>\n"
    segments = ("<s>[INST]", system_block, content, "[/INST]\n")
    return " ".join(segments)

# Show the rendered extraction prompt for the first paragraph.
print(generate_prompt(data[0]['content']))

<s>[INST] <<SYS>> You are a helpful assistant. Read the following text, if it mention the Gilbert damping constant of a certain material, list the corresponding material and its Gilbert damping canstant. Make your answer as short as possible.<</SYS>>
 ce as ∆ E(φ) given by Eq. (6). Due to this spe-
cial property the energy proﬁle ﬂips upside down at
u=√ω||ω⊥. For a generic Er(π/2,φ) with minima at
φ= 0,πand maxima at φ=±π/2 the nature of equilib-
ria will change at diﬀerent current thresholds. This will
make the switching diagram more complicated, but will
notaﬀectthestabilizationbyrepulsionphenomena. Sim-
ilarcomplicationswillbe introducedbyageneric f[(n·s)]
angular dependence of the spin transfer strength.
In Ref. 13 the known switching diagram for the
collinear ( φs= 0) devices [6, 9, 10] were reproduced
by equation (3) with Eeff=Er(π/2,φ). The ∆ E
term (6) was dropped as being second order in small
u. This approximation gives a correct result for the
following reason. In a collinea

In [8]:
# Run the extraction prompt over every paragraph and print only the text the
# model produced after the instruction block.
for record in data:
    input_prompt = generate_prompt(record['content'])
    encoded_prompt = tokenizer(input_prompt, return_tensors="pt")
    input_tokens = encoded_prompt["input_ids"].to("cuda")

    with torch.cuda.amp.autocast():
        generation_output = model.generate(
            input_ids=input_tokens,
            max_new_tokens=128,
            do_sample=True,
            top_k=5,
            top_p=0.9,
            temperature=0.2,
            repetition_penalty=1.1,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
        )
    op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    #print(op)

    # Split at the first '[/INST]'; `marker` is empty when it is absent.
    before, marker, reply = op.partition('[/INST]')
    if marker:
        print(reply)
    else:
        # Message text means: "'[/INST]' marker not found".
        print("未找到'[/INST]'标记")


The Gilbert damping constant of materials mentioned in the text are:

* Aluminum: α = 0.027 (Ref. [1])
* Copper: α = 0.022 (Ref. [2])
* Magnetic semiconductors: α = 0.01-0.05 (Ref. [3])
* Iron: α = 0.04 (Ref. [4])
* Nickel: α = 0.03 (Ref. [5])
* Cobalt: α = 0.

The Gilbert damping constant mentioned in the article is α = 0.05 meV/rad for the material NiFe.

The material mentioned in the text is a metallic ferromagnet, and the Gilbert damping constant is denoted by αG.

The Gilbert damping constant mentioned in the article is listed below:

* Ferromagnetic layer (F): γ_F = 4.3 × 10^4 rad/s
* Non-magnetic layer (N): γ_N = 1.3 × 10^3 rad/s

Note that these values are specific to the particular material system being studied and may not be applicable to other materials.

In this text, the Gilbert damping constant is mentioned for the non-collinear F/N/F trilayer system with ρ = π/2. The enhancement of the Gilbert damping constant is given by α' = gL μB g↑↓ 8π MdF1 S / (1 - ν cot θ cos ψ si

In [9]:
# NOTE(review): `probabilities` is not defined in any cell shown in this
# notebook; this line only works with leftover kernel state.
len(probabilities[0])

37553

In [23]:
# Display the most recently assigned prompt token ids.
input_tokens

tensor([[    1,   673,   278, 17279, 29901,    13,  3561,  1212,   293, 17279,
           411,  4482, 15611,   270,  1160,   292,  4868, 29889,    13,   450,
         13206, 16637,  7063,   310,   278,  5518, 29901]], device='cuda:0')

In [21]:
# Display the prompt token ids again (captured from a different execution
# than the identical cell above).
input_tokens

tensor([[    1,   673,   278, 17279, 29901,    13,  3561,  1212,   293, 17279,
           411,  4482, 15611,   270,  1160,   292,  4868, 29889,    13,   450,
         13206, 16637,  7063,   310,   278,  5518, 29901, 37551]],
       device='cuda:0')

In [67]:
# Save the model, explicitly including the embedding layers.
# NOTE(review): this writes to 'result', whereas training checkpoints go to
# 'results' — confirm the different directory is intended.
model.save_pretrained('result', save_embedding_layers=True)

In [None]:
# Print the dtype of every parameter tensor, one per line.
for weight in model.parameters():
    print(weight.dtype)

In [22]:
# Display the module tree of the current model.
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(37553, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=256, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=256, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
                (base_layer): Linear(in_features=40

In [15]:
# One line per parameter: trainable flag, qualified name, dtype.
for param_name, weight in model.named_parameters():
    print(weight.requires_grad, param_name, weight.dtype)

False base_model.model.model.embed_tokens.base_layer.weight torch.float16
True base_model.model.model.embed_tokens.lora_embedding_A.default torch.float16
True base_model.model.model.embed_tokens.lora_embedding_B.default torch.float16
False base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.float16
False base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight torch.float16
False base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight torch.float16
True base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.float16
True base_model.model.model.layers.0.self_attn.v_proj.l

In [14]:
# Verify which parameters are trainable
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Trainable: {name}")

Trainable: base_model.model.model.embed_tokens.weight
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.1.s

In [None]:
# Every parameter is printed with the "Trainable:" label; the boolean that
# follows the name is the actual requires_grad flag.
for param_name, weight in model.named_parameters():
    print(f"Trainable: {param_name}", weight.requires_grad)

In [14]:
# Evaluates to a generator object (see the output); iterate over it to see
# the individual parameters.
model.parameters()

<generator object Module.parameters at 0x7faf7c779ee0>

In [15]:
# Show the input-embedding module and its current (vocab size, hidden size).
model.get_input_embeddings()

Embedding(38544, 4096)

In [18]:
# Parameter count as reported by the model.
model.num_parameters()

6922694656

In [19]:
# Register "<PAD>" as the padding token and resize the embeddings to match.
# NOTE(review): these are the same two statements as in the token-adding cell;
# the outputs shown here (38544 rows before, 38545 after) indicate this
# tokenizer did not yet contain "<PAD>" when the cell ran.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(38545, 4096)