In [None]:
!pip install jsonlines python-dotenv lamini datasets langchain_groq

In [None]:
# to avoid conflit
!pip install numpy==1.26.4
!pip install --upgrade transformers

In [99]:
import os
import numpy as np
import pickle
import re
import jsonlines
import pandas as pd
from pprint import pprint
from dotenv import load_dotenv
import lamini
import logging
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorWithPadding,
    AutoModelForMaskedLM,
    TrainingArguments,
    Trainer
)
import torch
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.evaluation.qa import QAEvalChain

In [2]:
os.environ['GROQ_API_KEY'] = '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'

## 🔴 Load dataset and tokenization

#### 🟡 Load Dataset from HugginFace

In [None]:
finetuning_dataset = load_dataset("AmiraliSH/lamini")

In [21]:
train_dataset, test_dataset = finetuning_dataset["train"], finetuning_dataset["test"]
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['question', 'answer'],
    num_rows: 1120
})
Dataset({
    features: ['question', 'answer'],
    num_rows: 280
})


#### 🟡 Load Base Model

In [6]:
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

#### 🟡 Add padding token
<p>Use EOS token as padding if no pad token is set, ensuring compatibility during tokenization.</p>

#### 🟡 Add inference to simplify the prediction of the model

In [7]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [78]:
def inference(text, model, tokenizer, max_output_tokens=100):
    """
    Generates a model-based response for a given input text using a tokenizer and a language model.

    Parameters
    ----------
    text : str
        The input text to be processed and used as a prompt for the model.
    model : transformers.PreTrainedModel
        The pre-trained model used for generating text.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer associated with the model, used for tokenizing the input text and decoding the output tokens.
    max_output_tokens : int, optional
        The maximum number of tokens for the output sequence. Default is 100.

    Returns
    -------
    str
        The generated response text after removing the input prompt from the output.
    """

    # Tokenize
    inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=1024
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    generated_tokens_with_prompt = model.generate(
      input_ids=input_ids.to(device),
      attention_mask=attention_mask.to(device),
      max_new_tokens=max_output_tokens,
      pad_token_id=tokenizer.eos_token_id
    )

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

In [79]:
# Test the Interface for one sample
test_sample = test_dataset[0]
pprint(test_sample)
pprint(inference(test_sample["question"], model, tokenizer))

{'answer': 'Lamini can be used for any type of content generation, including '
           'creative writing. Try adapting one of our examples or walkthroughs '
           'to your use case. You can find these examples in our '
           'documentation.',
 'question': '### Question:\n'
             'Are there any tutorials on using Lamini for content generation '
             'in creative writing?\n'
             '\n'
             '### Answer:'}
('\n'
 '\n'
 'Lamini is a free, open source, open source, open source, open source, open '
 'source, open source, open source, open source, open source, open source, '
 'open source, open source, open source, open source, open source, open '
 'source, open source, open source, open source, open source, open source, '
 'open source, open source, open source, open source, open source, open '
 'source, open source, open source, open source, open')


#### 🟡 Tokenizer

In [10]:
def tokenize_function(examples):
    """
    Tokenizes question-answer pairs from the dataset with truncation.

    Parameters
    ----------
    examples : dict
        A batch of examples containing "question" and "answer" fields.

    Returns
    -------
    dict
        Tokenized inputs with the specified truncation and maximum length.
    """

    text = [q+a for q, a in zip(examples["question"], examples["answer"])]

    tokenizer.truncation_side = "left"
    tokenized_output = tokenizer(
        text,
        truncation=True,
        padding="max_length",  # Ensures all sequences have the same length
        max_length=1024
    )

    return tokenized_output

In [23]:
# Apply tokenizer on Train and Test datasets
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    drop_last_batch=True
)

tokenized_test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    drop_last_batch=True
)

Map:   0%|          | 0/1120 [00:00<?, ? examples/s]

In [24]:
# While input_ids provides the input tokens, labels explicitly specifies the targets the model should learn to predict during training
tokenized_train_dataset = tokenized_train_dataset.add_column("labels", tokenized_train_dataset["input_ids"])
tokenized_test_dataset = tokenized_test_dataset.add_column("labels", tokenized_test_dataset["input_ids"])

In [25]:
# evaluation data
eval_data_size = int(len(tokenized_train_dataset) * 0.2)
tokenized_eval_dataset = tokenized_train_dataset.select(range(eval_data_size))
final_train_dataset = tokenized_train_dataset.select(range(eval_data_size, len(tokenized_train_dataset)))

In [26]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 🔴 Training all the parameters

#### 🟡 Set the device

In [27]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model.to(device)
base_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

#### 🟡 Train arguments

In [32]:
# Make a directory to save the model
trained_model_name = f"new_save_model_dir"
output_dir = trained_model_name

training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  # num_train_epochs=5,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  max_steps=100,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=5, # Number of update steps between two evaluations
  save_steps=5, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  eval_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False,

  save_safetensors=False
)


#### 🟡 Calculate Memory footprint and Flops on input shape and gradient accumulation steps

In [33]:
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, 1024)
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
Memory footprint 0.510342192 GB
Flops 2090.336256 GFL

#### 🟡 Train the model

In [34]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

In [35]:
"""
Initialize the Trainer to manage the entire training process, including model training, evaluation,
and logging. It uses the provided model, training arguments, and tokenized datasets for training
and evaluation, handling tasks like optimization, checkpointing, and metric calculation.
"""
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=final_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    # compute_metrics=compute_metrics
)

In [36]:
training_output = trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33ma-sahraei98[0m ([33ma-sahraei98-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
5,6.0072,6.801677
10,1.7988,1.628249
15,0.4588,0.315967
20,0.3836,0.273885
25,0.3001,0.25938
30,0.2321,0.246287
35,0.2807,0.23629
40,0.2088,0.230925
45,0.1902,0.225085
50,0.2582,0.220617


#### 🟡 Save the model

In [37]:
# Save the model
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: new_save_model_dir/final


## 🔴 Evaluation the performance of model with help of another LLM

#### 🟡 Transfer the model to evaluation mode

In [38]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_slightly_model.to(device)
finetuned_slightly_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

#### 🟡 Prepare test data for evaluation and prediction for eval data


In [40]:
q_a_test_dataset = [{"query": q, "answer": a} for q, a in zip(test_dataset["question"], test_dataset["answer"])]

In [41]:
prediction = []
for sample in q_a_test_dataset:
    question = sample["query"]
    answer = sample["answer"]
    finetuned_model_answer = inference(question, finetuned_slightly_model, tokenizer)
    output = {"query": question, "answer": answer, "result": finetuned_model_answer}
    prediction.append(output)

#### 🟡 Initialize another LLM for evaluation

In [53]:
def initialize_llm(model_name):
    return ChatGroq(
        model=model_name,
        temperature=0,
        max_tokens=None,
        timeout=None,
        streaming=True,
    )

In [61]:
MODEL_NAME = "deepseek-r1-distill-llama-70b"
llm = initialize_llm(MODEL_NAME)
eval_chain = QAEvalChain.from_llm(llm)
result = eval_chain.evaluate(q_a_test_dataset, prediction)

#### 🟡 Count Accuracy

In [71]:
eval_result = [1 if i["results"].split("\n")[-1] == "GRADE: CORRECT" else 0 for i in result]
eval_result = (sum(eval_result) / len(eval_result)) * 100
print(f"Accuracy: {eval_result:.2f}%")

Accuracy: 49.64%


#### 🟡 Save the result of evaluation in a Datafraem

In [72]:
eval_df = pd.DataFrame.from_dict(prediction)
head_of_eval_df = eval_df.head()
style_df = head_of_eval_df.style.set_properties(**{'text-align': 'left'})
style_df = style_df.set_properties(**{"vertical-align": "text-top"})
style_df

Unnamed: 0,query,answer,result
0,### Question: Are there any tutorials on using Lamini for content generation in creative writing? ### Answer:,"Lamini can be used for any type of content generation, including creative writing. Try adapting one of our examples or walkthroughs to your use case. You can find these examples in our documentation.","Yes, there are tutorials on using Lamini for content generation in creative writing."
1,### Question: Can Lamini be used to perform sentiment analysis or opinion mining on large volumes of text data? ### Answer:,"Lamini can be used for sentiment analysis or opinion mining on large volumes of text data. To learn how, check out walkthroughs and examples available on Lamini’s website. With some imagination, you can adapt those examples to your data and use case.","Yes, Lamini can be used to perform sentiment analysis or opinion mining on large volumes of text data. Lamini can be used to perform sentiment analysis or opinion mining on large volumes of text data."
2,### Question: Do I have to pay for using Lamini? ### Answer:,"Everyone starts with 10,000 free credits, which is equivalent to about $100. After that, you can purchase more credits in the “API” tab at app.lamini.ai.","Yes, Lamini is free to use. However, you may need to pay for the Lamini software to use it."
3,### Question: Can Lamini understand and generate text in multiple languages? ### Answer:,"Yes, Lamini can understand and generate text in multiple languages. It currently supports over 20 languages, including English, Spanish, French, German, Chinese, and Japanese.","Yes, Lamini can generate text in multiple languages. Lamini can generate text in multiple languages."
4,### Question: Can Lamini talk to animals or understand what they're saying? ### Answer:,"While Lamini possesses extraordinary linguistic capabilities, it is crucial to note that its abilities do not extend to conversing with our animal counterparts or comprehending their communications. As an AI language model, Lamini's domain of expertise revolves around processing and generating text, responding to human inquiries and prompts with remarkable precision. While the enigmatic language of animals remains beyond its purview, Lamini's prowess in linguistic understanding and contextual interpretation continues to astound, forging new frontiers in human-machine interactions. While our fascination with bridging the gap between human and animal communication endures, Lamini's current capacities remain focused on enhancing our understanding of language and facilitating meaningful dialogue in the realms of human discourse.","Yes, Lamini can talk to animals or understand what they're saying. Lamini can talk to animals or understand what they're saying."


In [None]:
# Save the Dataframe file
eval_df.to_csv("eval_df.csv")

# 🔴 LORA

#### 🟡 Config LORA

In [92]:
peft_config = LoraConfig(
    task_type="CAUSAL_LM",  # Task type for GPT-2
    r=4,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor for LoRA weights
    lora_dropout=0.01,  # Dropout for LoRA layers
    target_modules=["c_attn"],  # Target the combined attention layer
    fan_in_fan_out=True,  # Set this to True for Conv1D layers
)

In [93]:
model = get_peft_model(base_model, peft_config)

In [94]:
# Print trainable parameters
model.print_trainable_parameters()

trainable params: 147,456 || all params: 124,587,264 || trainable%: 0.1184


#### 🟡 Traning

In [97]:
training_args = TrainingArguments(
    # Learning rate
    learning_rate=1.0e-5,

    # Number of training epochs
    num_train_epochs=5,

    # Batch size for training
    per_device_train_batch_size=8,

    # Directory to save model checkpoints
    output_dir="/content/LORA",

    # Other arguments
    overwrite_output_dir=False,  # Overwrite the content of the output directory
    disable_tqdm=False,  # Disable progress bars
    warmup_steps=1,  # Number of warmup steps for learning rate scheduler
    per_device_eval_batch_size=1,  # Batch size for evaluation
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    logging_strategy="epoch",  # Log metrics at the end of each epoch
    save_strategy="epoch",  # Save model at the end of each epoch
    optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=False,

    # Parameters for early stopping
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    save_safetensors=False,
    report_to="all",  # Report metrics to all available logging integrations
)

In [98]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    # data_collator=data_collator,
)

# Train the model
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,0.2243,0.206134
2,0.2237,0.205922
3,0.2228,0.205751
4,0.2226,0.205645
5,0.2229,0.205605


TrainOutput(global_step=140, training_loss=0.22327780042375836, metrics={'train_runtime': 1282.7673, 'train_samples_per_second': 3.492, 'train_steps_per_second': 0.109, 'total_flos': 2345235350814720.0, 'train_loss': 0.22327780042375836, 'epoch': 5.0})

In [100]:
# Save the model
save_dir = f'/content/LORA/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: /content/LORA/final


#### 🟡 Transfer the model to evaluation mode

In [101]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_slightly_model.to(device)
finetuned_slightly_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.01, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=4, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=4, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=Fa

#### 🟡 Prediction for eval data


In [102]:
prediction = []
for sample in q_a_test_dataset:
    question = sample["query"]
    answer = sample["answer"]
    finetuned_model_answer = inference(question, finetuned_slightly_model, tokenizer)
    output = {"query": question, "answer": answer, "result": finetuned_model_answer}
    prediction.append(output)

#### 🟡 Initialize another LLM for evaluation

In [103]:
def initialize_llm(model_name):
    return ChatGroq(
        model=model_name,
        temperature=0,
        max_tokens=None,
        timeout=None,
        streaming=True,
    )

In [104]:
MODEL_NAME = "deepseek-r1-distill-llama-70b"
llm = initialize_llm(MODEL_NAME)
eval_chain = QAEvalChain.from_llm(llm)
result = eval_chain.evaluate(q_a_test_dataset, prediction)

#### 🟡 Count Accuracy

In [105]:
eval_result = [1 if i["results"].split("\n")[-1] == "GRADE: CORRECT" else 0 for i in result]
eval_result = (sum(eval_result) / len(eval_result)) * 100
print(f"Accuracy: {eval_result:.2f}%")

Accuracy: 32.14%
