In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

In [2]:
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map="auto"
                                            )
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Load your custom JSON dataset
custom_data = load_dataset('json', data_files='data_eval.json')

# Access train, test, and validation splits if available
data_train = custom_data['train']

custom_data = load_dataset('json', data_files='data_eval.json')
data_val = custom_data['train']

# Print the dataset details
print(data_train)
print(data_val)

# Access an example
#example = data_train[0]
#print(example)

def generate_prompt(title, abstract=None, eos_token="</s>"):
  instruction = "The abstract of the paper:\n"
  input = f"{title}\n"
  abstract = f"Abstract: {abstract + ' ' + eos_token if abstract else ''} "
  prompt = (" ").join([instruction, input, abstract])
  return prompt

print(generate_prompt(data_train[0]["title"], data_train[0]["abstract"]))

Dataset({
    features: ['publicationDate', 'abstract', 'title'],
    num_rows: 559
})
Dataset({
    features: ['publicationDate', 'abstract', 'title'],
    num_rows: 559
})
The abstract of the paper:
 Characterization of discharge, hydrogeochemical process and evaluation of water quality of some warm and cold springs, northeastern and southeastern Nigeria
 Abstract: A study was conducted on some spring, river and well waters in parts of Nigeria to assess their discharge characteristics, processes controlling water chemistry, reservoir temperature and utilization for drinking and irrigation purposes. The study area lies in Yankari Games Reserve (YGR), underlain by Cretaceous Sandstone characterized by warm springs in northeastern Nigeria and Cross River State (CRS), underlain by Precambrian–Tertiary sediments of southeastern Nigeria, characterized by cold springs. The average discharge for the springs and rivers in the YGR were 2.74±2.07 and 89.42±74.49 m^3/sec, respectively. In the CR

In [5]:
input_prompt = generate_prompt(data_train[50]["title"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=1000,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

The abstract of the paper:
 Investigation of Static and Dynamic Magnetization in Ni_2FeAl Full Heusler Alloy Nanomaterials
 Abstract:   In this work, we have investigated static magnetization and dynamic magnetization behavior of Ni_{2}FeAl full-Heusler alloy nanoparticles. For this purpose, a series of Ni_{2}FeAl (10 nm) and Ni_{2}FeAl (5nm) samples were prepared by using thermal evaporation method at room temperature. We used XRD to determine crystal structure of these materials. Then, we measured magnetic properties such as saturation magnetization M_{S}, coercivity H_{C}, remanence Br, and squareness Sq for both samples. We also performed AC susceptibility measurements on Ni_{2}FeAl (10 nm), Ni_{2}FeAl (5 nm). From our results, it is found that Ni_{2}FeAl (10 nm) has higher saturation magnetization than Ni_{2}FeAl (5nm). Also, we observed that Ni_{2}FeAl (10 nm) shows better magnetic response compared with Ni_{2}FeAl (5nm) under AC field. This can be attributed to the fact that Ni_

In [5]:
with open('materials.txt', 'r', encoding='utf-8') as file:
    word_list = file.read().splitlines()

In [4]:
lora_config = LoraConfig(
        r=128,
        lora_alpha=256,
        lora_dropout=0.05,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

In [6]:
# Step 1: Add new tokens to the tokenizer
num_added_toks = tokenizer.add_tokens(word_list)
print(f"Added {num_added_toks} tokens")

tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

model = get_peft_model(model, lora_config)

Added 6110 tokens


In [11]:
# Step 2: Freeze all parameters in the model
#for param in model.parameters():
#    param.requires_grad = False

embeddings = model.get_input_embeddings()

# Enable gradient updates for the entire embedding layer
# Assuming you might want to fine-tune all embeddings, but here's how to selectively unfreeze
embeddings.weight.requires_grad = True

# Optionally, if you want to specifically unfreeze only the new tokens' embeddings:
# This would require knowing the specific indices of the new tokens, which might be complex to handle directly.
# A more general approach is to unfreeze the whole embedding layer as new tokens are usually appended at the end.

# Verify which parameters are trainable
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Trainable: {name}")

Trainable: base_model.model.model.embed_tokens.weight
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight
Trainable: base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight
Trainable: base_model.model.model.layers.1.s

In [7]:
# 准备训练参数
training_args = TrainingArguments(
    output_dir='results',            # 输出目录
    num_train_epochs=1,              # 总训练轮数
    per_device_train_batch_size=4,   # 训练的batch size
    per_device_eval_batch_size=4,    # 验证的batch size
    warmup_steps=10,                # 预热步数
    learning_rate = 1e-4,
    max_grad_norm = 0.2,
    #max_steps = 50,
    #warmup_ratio = 0.03,
    weight_decay=0.01,               # 权重衰减
    save_strategy="steps",           # 设置保存策略为"steps"
    save_steps=10,                  # 每500步保存一次模型
    save_total_limit=2,              # 最多保存3个检查点
    evaluation_strategy="steps",     # 设置评估策略为"steps"
    eval_steps=500                  # 每500步评估一次模型
)

In [8]:
# Adjust the preprocessing function to handle your dataset's structure
def preprocess_function(examples):
    prompts = [generate_prompt(title, abstract) for title, abstract in zip(examples["title"], examples["abstract"])]
    # Tokenize the prompts to model inputs
    tokenized = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    return tokenized

# Apply the preprocessing function to your datasets
tokenized_train = data_train.map(preprocess_function, batched=True)
tokenized_val = data_val.map(preprocess_function, batched=True)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    # 可以添加更多配置项
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 47.54 GiB total capacity; 46.96 GiB already allocated; 30.75 MiB free; 47.13 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
for name, param in model.named_parameters():
    print(f"Trainable: {name}", param.requires_grad)

In [14]:
model.parameters()

<generator object Module.parameters at 0x7faf7c779ee0>

In [15]:
model.get_input_embeddings()

Embedding(38544, 4096)

In [16]:
model.num_parameters()

6792024064

In [19]:
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(38545, 4096)

In [20]:
from transformers import TrainingArguments, Trainer, default_data_collator

# Adjust the preprocessing function to handle your dataset's structure
def preprocess_function(examples):
    prompts = [generate_prompt(title, abstract) for title, abstract in zip(examples["title"], examples["abstract"])]
    # Tokenize the prompts to model inputs
    tokenized = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    return tokenized

# Apply the preprocessing function to your datasets
tokenized_train = data_train.map(preprocess_function, batched=True)
tokenized_val = data_val.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="results",            # Output directory for model checkpoints
    evaluation_strategy="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=4,   # Batch size for training
    per_device_eval_batch_size=4,    # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Weight decay for regularization
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

# Start training
trainer.train()

Map:   0%|          | 0/559 [00:00<?, ? examples/s]

Map:   0%|          | 0/559 [00:00<?, ? examples/s]

ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details