In [35]:

from datasets import load_dataset
import pandas as pd

dataset = load_dataset("csv", data_files="FineTuning/output/qa_pairs.csv")
print(dataset)

df = pd.DataFrame(dataset['train'])
new_df = pd.DataFrame()
new_df["text"] = df[["question", "answer"]].apply(lambda x: "<|im_start|>user\n" + x["question"] + " <|im_end|>\n<|im_start|>assistant\n" + x["answer"] + "<|im_end|>\n", axis=1)


from datasets import Dataset

sample_data = Dataset.from_pandas(new_df)
print(sample_data)

             

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 86
    })
})
Dataset({
    features: ['text'],
    num_rows: 86
})


In [None]:
# Select the Device for Model Training

In [12]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # If Apple Silicon, set to 'mps' - otherwise 'cpu' (not advised)
    try:
        device = torch.device('mps')
    except Exception:
        device = torch.device('cpu')

In [13]:
device

device(type='mps')

In [None]:
# Load the Tokenizer and Pre-trained Model

In [14]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2").to(device)

In [None]:
#Dataset Preparation and Custom Dataset Class Definition

In [37]:
def tokenize_function(input_dict):
    return tokenizer(input_dict['text'], truncation=True)
tokenized_dataset = sample_data.map(tokenize_function, batched=True, num_proc=4, remove_columns=['text'])
tokenized_dataset

Map (num_proc=4):   0%|          | 0/86 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 86
})

In [None]:
''' Grouping Tokenized Text

The grouping tokenized text process involves dividing a tokenized text into fixed-length blocks or chunks to efficiently process large datasets during NLP tasks.
By splitting the tokenized sequence into smaller segments, each of equal size,
it becomes easier to handle and process the data in parallel, making it ideal for tasks like language modeling and text generation.'''


In [41]:
max_block_length = 128

def divide_tokenized_text(tokenized_text_dict, block_size):
    """
    Divides the tokenized text in the examples into fixed-length blocks of size block_size.

    Parameters:
    -----------
    tokenized_text_dict: dict
        A dictionary containing tokenized text as values for different keys.

    block_size: int
        The desired length of each tokenized block.

    Returns:
    -----------
        dict: A dictionary with tokenized text divided into fixed-length blocks.
    """
    concatenated_examples = {k: sum(tokenized_text_dict[k], []) for k in tokenized_text_dict.keys()}
    total_length = len(concatenated_examples[list(tokenized_text_dict.keys())[0]])
    total_length = (total_length // block_size) * block_size

    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    result['labels'] = result['input_ids'].copy()
    return result


lm_dataset = tokenized_dataset.map(
    lambda tokenized_text_dict: divide_tokenized_text(tokenized_text_dict, max_block_length),
    batched=True,
    batch_size=1000,
    num_proc=4,
)
print(lm_dataset)

Map (num_proc=4):   0%|          | 0/86 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 51
})


In [None]:
# Get train and evaluation datasets

In [44]:
#train_dataset = lm_dataset['train'].shuffle(seed=42).select(range(10))
#eval_dataset = lm_dataset['validation'].shuffle(seed=42).select(range(10))
train_dataset = lm_dataset.shuffle(seed=42).select(range(51))  
eval_dataset = lm_dataset.shuffle(seed=42).select(range(11))

In [45]:
# Fine-tuning the model
'''
The training process is controlled by the TrainingArguments, where we define hyperparameters like the learning rate and weight decay. 
The model is trained on a question-answering dataset, divided into training and evaluation sets (train_dataset and eval_dataset). 
During training, the model's parameters are optimized to predict answers for given questions, making it capable of providing accurate responses to queries.

Also, To ensure the model's compatibility with the tokenization process, we add a special '[PAD]' token to the tokenize

'''



In [48]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model_checkpoint = 'distilgpt2'

tokenizer.add_special_tokens({'pad_token': '[PAD]'})


training_args = TrainingArguments(
    f'./{model_checkpoint}-NDI',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False, # Change to True to push the model to the Hub
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [50]:
# Evaluating the fine-tuned model
import math
eval_results = trainer.evaluate()
print(f'Perplexity: {math.exp(eval_results["eval_loss"]):.2f}')

Perplexity: 24.55


In [54]:
#Push model to Hugging Face Hub
tokenizer.save_pretrained('distilgpt2-NDI')
model.save_pretrained('distilgpt2-NDI')
model.push_to_hub('distilgpt2-NDI')

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AnanthMeka/distilgpt2-NDI/commit/53424f29cc962e4296bda10a8acb2727ffc2c47f', commit_message='Upload model', commit_description='', oid='53424f29cc962e4296bda10a8acb2727ffc2c47f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/AnanthMeka/distilgpt2-NDI', endpoint='https://huggingface.co', repo_type='model', repo_id='AnanthMeka/distilgpt2-NDI'), pr_revision=None, pr_num=None)

In [58]:
# Save Locally 
model.save_pretrained('FineTune/distilgpt2-NDI')
tokenizer.save_pretrained('FineTune/distilgpt2-NDI')

('FineTune/distilgpt2-NDI/tokenizer_config.json',
 'FineTune/distilgpt2-NDI/special_tokens_map.json',
 'FineTune/distilgpt2-NDI/vocab.json',
 'FineTune/distilgpt2-NDI/merges.txt',
 'FineTune/distilgpt2-NDI/added_tokens.json',
 'FineTune/distilgpt2-NDI/tokenizer.json')

In [74]:
# FineTune with LORA 
!pip install peft
from peft import LoraConfig, PeftModel, get_peft_model

# GET THE LORA MODEL 
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2").to(device)


#If only targeting attention blocks of the model
target_modules = ["q_proj", "v_proj"]

#If targeting all linear layers
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']

lora_config = LoraConfig(
                          r=16,
                          target_modules="all-linear",
                          lora_alpha=8,
                          lora_dropout=0.05,
                          bias="lora_only",
                          task_type="CAUSAL_LM",
                          modules_to_save=["lm_head", "embed_token"],)


lora_model = get_peft_model(model, lora_config)
PeftModel.print_trainable_parameters(lora_model)




Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
trainable params: 39,818,496 || all params: 121,689,600 || trainable%: 32.7214




In [81]:
# With Normal Trainer
#model_name = checkpoint.split("/")[-1]
def compute_metrics(pred):
    squad_labels = pred.label_ids
    squad_preds = pred.predictions.argmax(-1)

    # Calculate Exact Match (EM)
    em = sum([1 if p == l else 0 for p, l in zip(squad_preds, squad_labels)]) / len(squad_labels)

    # Calculate F1-score
    f1 = f1_score(squad_labels, squad_preds, average='macro')

    return {
        'exact_match': em,
        'f1': f1
    }
    
training_args = TrainingArguments(
    output_dir="outputs",
    learning_rate=5e-4,
    num_train_epochs=50,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    save_total_limit=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=5,
    remove_unused_columns=False,
    push_to_hub=True,
    label_names=["labels"],
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
# TypeError: GPT2LMHeadModel.forward() got an unexpected keyword argument 'num_items_in_batch'


TypeError: GPT2LMHeadModel.forward() got an unexpected keyword argument 'num_items_in_batch'

In [83]:
# Train the  Model with the SFTTrainer 
!pip install peft
!pip install trl
from trl import SFTConfig, SFTTrainer 
from peft import LoraConfig, PeftModel, get_peft_model

# GET THE LORA MODEL 
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2").to(device)

# tokenized data that is already available  
#train_dataset = lm_dataset.shuffle(seed=42).select(range(51))  
#eval_dataset = lm_dataset.shuffle(seed=42).select(range(11))

# Sample Dataset before tokenization is 
# sample_data i.e with Text column
#train_dataset = sample_data.shuffle(seed=42).select(range(51))  
#eval_dataset = sample_data.shuffle(seed=42).select(range(11))

#dataset = load_dataset("stanfordnlp/imdb", split="train")

training_args = SFTConfig(output_dir="/tmp")

trainer = SFTTrainer(
    model,
    train_dataset=sample_data,
    args=training_args,
)

trainer.train()


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


Applying chat template to train dataset:   0%|          | 0/86 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/86 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/86 [00:00<?, ? examples/s]

Step,Training Loss


TrainOutput(global_step=33, training_loss=2.8472627581972065, metrics={'train_runtime': 46.4792, 'train_samples_per_second': 5.551, 'train_steps_per_second': 0.71, 'total_flos': 7405109084160.0, 'train_loss': 2.8472627581972065})

In [86]:
# Train the  Model with the LORA and with SFTTrainer -- ERROR 
'''
peft_model = AutoModelForCausalLM.from_pretrained(
    "distilbert/distilgpt2", 
    peft_config=lora_config,
).to(device)

TypeError: GPT2LMHeadModel.__init__() got an unexpected keyword argument 'peft_config'
'''

!pip install peft
!pip install trl
from trl import SFTConfig, SFTTrainer 
from peft import LoraConfig, PeftModel, get_peft_model

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2").to(device)

# tokenized data that is already available  
#train_dataset = lm_dataset.shuffle(seed=42).select(range(51))  
#eval_dataset = lm_dataset.shuffle(seed=42).select(range(11))

# Sample Dataset before tokenization is 
# sample_data i.e with Text column
#train_dataset = sample_data.shuffle(seed=42).select(range(51))  
#eval_dataset = sample_data.shuffle(seed=42).select(range(11))

#dataset = load_dataset("stanfordnlp/imdb", split="train")


# 1. GET THE LORA MODEL to be finetuned by calling the PEFT 
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

peft_model = AutoModelForCausalLM.from_pretrained(
    "distilbert/distilgpt2", 
    peft_config=lora_config,
).to(device)

PeftModel.print_trainable_parameters(peft_model)



Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


TypeError: GPT2LMHeadModel.__init__() got an unexpected keyword argument 'peft_config'

In [87]:
# Using the previous cell with get_peft_model() -- WORKS

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2").to(device)

target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']

lora_config = LoraConfig(
                          r=16,
                          target_modules="all-linear",
                          lora_alpha=8,
                          lora_dropout=0.05,
                          bias="lora_only",
                          task_type="CAUSAL_LM",
                          modules_to_save=["lm_head", "embed_token"],)


lora_model = get_peft_model(model, lora_config)
PeftModel.print_trainable_parameters(lora_model)

trainable params: 39,818,496 || all params: 121,689,600 || trainable%: 32.7214




In [88]:
# Applying the SFT Trainer train on the LORA Model 
# TypeError: GPT2LMHeadModel.forward() got an unexpected keyword argument 'num_items_in_batch'   -- ERROR 

from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

dataset = sample_data

training_args = SFTConfig(
    max_seq_length=2048,
    output_dir="./FineTune/LoRA-SFT/",
)
trainer = SFTTrainer(
    model = lora_model,
    train_dataset=dataset,
    args=training_args,
)
trainer.train()
training_args = SFTConfig(output_dir="/tmp")

trainer = SFTTrainer(
    model,
    train_dataset=sample_data,
    args=training_args,
)

trainer.train()


Applying chat template to train dataset:   0%|          | 0/86 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/86 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/86 [00:00<?, ? examples/s]

TypeError: GPT2LMHeadModel.forward() got an unexpected keyword argument 'num_items_in_batch'

In [95]:
# Applying the ORPO Trainer 
# Expects PROMPT - RESPONSE 
'''
ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and 
no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, 
please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating

'''
from trl import apply_chat_template
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

dataset_dict = {
    "prompt": [[{"role": "user", "content": "What color is the sky?"}],
               [{"role": "user", "content": "Where is the sun?"}]],
    "completion": [[{"role": "assistant", "content": "It is blue."}],
                   [{"role": "assistant", "content": "In the sky."}]]
}

dataset = Dataset.from_dict(dataset_dict)
tokenizer.apply_chat_template(dataset, tokenize=False)
#dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})

#dataset = load_dataset("json", data_files="FineTuning/output/qa_pairs.json")
'''

from datasets import load_dataset
from trl import ORPOConfig, ORPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = lora_model
dataset = sample_data

training_args = ORPOConfig(output_dir="FineTune/NDI-ORPO", logging_steps=10)
trainer = ORPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=sample_data)
trainer.train()
'''

ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating

In [99]:
# USE apply_chat_template directly 

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct") # This Model tokenizer works 
#tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2") # This model tokenizer is not working for using in apply_chat_template(), so we need to apply the apply the chat template through code only 

example = {
    "prompt": [{"role": "user", "content": "What color is the sky?"}],
    "completion": [{"role": "assistant", "content": "It is blue."}]
}
apply_chat_template(example, tokenizer)

{'prompt': '<|user|>\nWhat color is the sky?<|end|>\n<|assistant|>\n',
 'completion': 'It is blue.<|end|>\n<|endoftext|>'}