In [1]:
import torch
import transformers
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from peft import (
        get_peft_model, 
        prepare_model_for_kbit_training, 
        LoraConfig
    )
from trl import SFTTrainer

In [2]:
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             load_in_8bit=True,
                                             device_map="auto"
                                            )
tokenizer = AutoTokenizer.from_pretrained(model_name)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (

In [4]:
# Load your custom JSON dataset
custom_data = load_dataset('json', data_files='data_eval.json')

# Access train, test, and validation splits if available
data_train = custom_data['train']

custom_data = load_dataset('json', data_files='data_eval.json')
data_val = custom_data['train']

# Print the dataset details
print(data_train)
print(data_val)

# Access an example
#example = data_train[0]
#print(example)

def generate_prompt(title, abstract=None, eos_token="</s>"):
  instruction = "The abstract of the paper:\n"
  input = f"{title}\n"
  abstract = f"Abstract: {abstract + ' ' + eos_token if abstract else ''} "
  prompt = (" ").join([instruction, input, abstract])
  return prompt

print(generate_prompt(data_train[0]["title"], data_train[0]["abstract"]))

Dataset({
    features: ['title', 'publicationDate', 'abstract'],
    num_rows: 559
})
Dataset({
    features: ['title', 'publicationDate', 'abstract'],
    num_rows: 559
})
The abstract of the paper:
 Characterization of discharge, hydrogeochemical process and evaluation of water quality of some warm and cold springs, northeastern and southeastern Nigeria
 Abstract: A study was conducted on some spring, river and well waters in parts of Nigeria to assess their discharge characteristics, processes controlling water chemistry, reservoir temperature and utilization for drinking and irrigation purposes. The study area lies in Yankari Games Reserve (YGR), underlain by Cretaceous Sandstone characterized by warm springs in northeastern Nigeria and Cross River State (CRS), underlain by Precambrian–Tertiary sediments of southeastern Nigeria, characterized by cold springs. The average discharge for the springs and rivers in the YGR were 2.74±2.07 and 89.42±74.49 m^3/sec, respectively. In the CR

In [5]:
input_prompt = generate_prompt(data_train[50]["title"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"].to("cuda")
with torch.cuda.amp.autocast():
  generation_output = model.generate(
      input_ids=input_tokens,
      max_new_tokens=1000,
      do_sample=True,
      top_k=10,
      top_p=0.9,
      temperature=0.3,
      repetition_penalty=1.15,
      num_return_sequences=1,
      eos_token_id=tokenizer.eos_token_id,
    )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

The abstract of the paper:
 Investigation of Static and Dynamic Magnetization in Ni_2FeAl Full Heusler Alloy Nanomaterials
 Abstract:   In this work, we have investigated static magnetization (M) and dynamic magnetization (MD) in Ni_{2}FeAl full-Heusler alloy nanoparticles. We used a combination of XRD, SEM, TEM, SQUID, MFM, and EDS techniques to characterize these materials. The results show that the average grain size is about 10 nm for all samples with an increase in particle size up to 50 nm at higher temperatures. The magnetic properties were measured using a superconducting quantum interference device (SQUID). The temperature dependence of the coercivity Hc was found to be negative for all samples indicating ferromagnetic behavior. The saturation magnetization (MSat) decreases from 46 emu/g to 39 emu/g as the particle size increases from 10nm to 50nm. The dynamic magnetization (MD) measurements showed that the sample with smaller particles has lower MD compared to larger ones. Th

In [7]:
with open('words.txt', 'r', encoding='utf-8') as file:
    word_list = file.read().splitlines()

In [9]:
# Step 1: Add new tokens to the tokenizer
num_added_toks = tokenizer.add_tokens(word_list)
print(f"Added {num_added_toks} tokens")

# Update the model's embedding sizes to accommodate new tokens
model.resize_token_embeddings(len(tokenizer))

Added 6544 tokens


Embedding(38544, 4096)

In [12]:
# Step 2: Freeze all parameters in the model
for param in model.parameters():
    param.requires_grad = False

embeddings = model.get_input_embeddings()

# Enable gradient updates for the entire embedding layer
# Assuming you might want to fine-tune all embeddings, but here's how to selectively unfreeze
embeddings.weight.requires_grad = True

# Optionally, if you want to specifically unfreeze only the new tokens' embeddings:
# This would require knowing the specific indices of the new tokens, which might be complex to handle directly.
# A more general approach is to unfreeze the whole embedding layer as new tokens are usually appended at the end.

# Verify which parameters are trainable
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"Trainable: {name}")

Trainable: model.embed_tokens.weight


In [14]:
model.parameters()

<generator object Module.parameters at 0x7faf7c779ee0>

In [15]:
model.get_input_embeddings()

Embedding(38544, 4096)

In [16]:
model.num_parameters()

6792024064

In [19]:
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(38545, 4096)

In [20]:
from transformers import TrainingArguments, Trainer, default_data_collator

# Adjust the preprocessing function to handle your dataset's structure
def preprocess_function(examples):
    prompts = [generate_prompt(title, abstract) for title, abstract in zip(examples["title"], examples["abstract"])]
    # Tokenize the prompts to model inputs
    tokenized = tokenizer(prompts, padding="max_length", truncation=True, max_length=512)
    return tokenized

# Apply the preprocessing function to your datasets
tokenized_train = data_train.map(preprocess_function, batched=True)
tokenized_val = data_val.map(preprocess_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="results",            # Output directory for model checkpoints
    evaluation_strategy="epoch",     # Evaluate every epoch
    learning_rate=2e-5,              # Learning rate
    per_device_train_batch_size=4,   # Batch size for training
    per_device_eval_batch_size=4,    # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Weight decay for regularization
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)

# Start training
trainer.train()

Map:   0%|          | 0/559 [00:00<?, ? examples/s]

Map:   0%|          | 0/559 [00:00<?, ? examples/s]

ValueError: You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft for more details