In [None]:
!pip install jsonlines python-dotenv lamini datasets langchain_groq

In [2]:
import os
import pandas as pd
import pickle
import jsonlines
from pprint import pprint
from dotenv import load_dotenv
import lamini
import logging
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
import torch
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain.evaluation.qa import QAEvalChain

In [None]:
os.environ['GROQ_API_KEY'] = 'GROQ_API_KEY'

# Load dataset and tokenization

In [None]:
# load dataset from Hub
finetuning_dataset = load_dataset("AmiraliSH/lamini")

In [5]:
train_dataset, test_dataset = finetuning_dataset["train"], finetuning_dataset["test"]
print(train_dataset)
print(test_dataset)

Dataset({
    features: ['question', 'answer'],
    num_rows: 1120
})
Dataset({
    features: ['question', 'answer'],
    num_rows: 280
})


In [None]:
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [7]:
# Use EOS token as padding if no pad token is set, ensuring compatibility during tokenization.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [8]:
def find_max_length(dataset, tokenizer):
    """
    Finds the maximum tokenized length of concatenated question-answer pairs in the dataset.

    Parameters
    ----------
    dataset : Dataset
        The dataset containing "question" and "answer" fields.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer used to convert text to token IDs.

    Returns
    -------
    int
        The maximum tokenized length found in the dataset.
    """
    max_value = 0
    for i in range(len(dataset)):
        question, answer = dataset["question"][i], dataset["answer"][i]
        tokenized_output_len = len(tokenizer(question+answer)["input_ids"])
        if tokenized_output_len > max_value:
            max_value = tokenized_output_len
    return max_value

max_length = find_max_length(train_dataset, tokenizer)

In [9]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
    """
    Generates a model-based response for a given input text using a tokenizer and a language model.

    Parameters
    ----------
    text : str
        The input text to be processed and used as a prompt for the model.
    model : transformers.PreTrainedModel
        The pre-trained model used for generating text.
    tokenizer : transformers.PreTrainedTokenizer
        The tokenizer associated with the model, used for tokenizing the input text and decoding the output tokens.
    max_input_tokens : int, optional
        The maximum number of tokens for the input sequence. Default is 1000.
    max_output_tokens : int, optional
        The maximum number of tokens for the output sequence. Default is 100.

    Returns
    -------
    str
        The generated response text after removing the input prompt from the output.
    """

    # Tokenize
    inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_tokens
    )

    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generate
    device = model.device
    generated_tokens_with_prompt = model.generate(
      input_ids=input_ids.to(device),
      attention_mask=attention_mask.to(device),
      max_new_tokens=max_output_tokens,
      pad_token_id=tokenizer.eos_token_id
    )

    # Decode
    generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

    # Strip the prompt
    generated_text_answer = generated_text_with_prompt[0][len(text):]

    return generated_text_answer

In [10]:
# Test the Interface for one sample
test_sample = test_dataset[0]
pprint(test_sample)
pprint(inference(test_sample["question"], model, tokenizer))

{'answer': 'Lamini can be used for any type of content generation, including '
           'creative writing. Try adapting one of our examples or walkthroughs '
           'to your use case. You can find these examples in our '
           'documentation.',
 'question': '### Question:\n'
             'Are there any tutorials on using Lamini for content generation '
             'in creative writing?\n'
             '\n'
             '### Answer:'}
('\n'
 '\n'
 'Lamini is a free, open source, open source, open source, open source, open '
 'source, open source, open source, open source, open source, open source, '
 'open source, open source, open source, open source, open source, open '
 'source, open source, open source, open source, open source, open source, '
 'open source, open source, open source, open source, open source, open '
 'source, open source, open source, open source, open')


In [11]:
def tokenize_function(examples):
    """
    Tokenizes question-answer pairs from the dataset with truncation.

    Parameters
    ----------
    examples : dict
        A batch of examples containing "question" and "answer" fields.

    Returns
    -------
    dict
        Tokenized inputs with the specified truncation and maximum length.
    """

    text = [q+a for q, a in zip(examples["question"], examples["answer"])]

    tokenizer.truncation_side = "left"
    tokenized_output = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length
    )

    return tokenized_output

In [None]:
# Apply tokenizer on Train and Test datasets
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    drop_last_batch=True
)

tokenized_test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32,
    drop_last_batch=True
)

In [13]:
# While input_ids provides the input tokens, labels explicitly specifies the targets the model should learn to predict during training
tokenized_train_dataset = tokenized_train_dataset.add_column("labels", tokenized_train_dataset["input_ids"])
tokenized_test_dataset = tokenized_test_dataset.add_column("labels", tokenized_test_dataset["input_ids"])

# Training

In [None]:
# Load base model
base_model = AutoModelForCausalLM.from_pretrained(model_name)
base_model

In [15]:
# Set the device
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

base_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [16]:
# Try base model before finetuning
test_sample_question = test_dataset["question"][0]
test_sample_answer = test_dataset["answer"][0]
print("Question: ", test_sample_question); print(100*"-")
print("Correct answer: ", test_sample_answer); print(100*"-")
print("Model's answer:", inference(test_sample_answer, base_model, tokenizer))

Question:  ### Question:
Are there any tutorials on using Lamini for content generation in creative writing?

### Answer:
----------------------------------------------------------------------------------------------------
Correct answer:  Lamini can be used for any type of content generation, including creative writing. Try adapting one of our examples or walkthroughs to your use case. You can find these examples in our documentation.
----------------------------------------------------------------------------------------------------
Model's answer: 

The following examples are based on the original source code of the original game.

Example 1: Creating a new game

The first step is to create a new game.

Create a new game.

Create a new game.

Create a new game.

Create a new game.

Create a new game.

Create a new game.

Create a new game.

Create a new game.

Create a new game


In [17]:
# Make a directory to save the model
trained_model_name = f"lamini_docs_steps"
output_dir = trained_model_name
output_dir

'lamini_docs_steps'

In [19]:
training_args = TrainingArguments(

  # Learning rate
  learning_rate=1.0e-5,

  # Number of training epochs
  num_train_epochs=5,

  # Max steps to train for (each step is a batch of data)
  # Overrides num_train_epochs, if not -1
  # max_steps=5,

  # Batch size for training
  per_device_train_batch_size=1,

  # Directory to save model checkpoints
  output_dir=output_dir,

  # Other arguments
  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=120, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler
  per_device_eval_batch_size=1, # Batch size for evaluation
  eval_strategy="steps",
  logging_strategy="steps",
  logging_steps=1,
  optim="adafactor",
  gradient_accumulation_steps = 4,
  gradient_checkpointing=False,

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_total_limit=1,
  metric_for_best_model="eval_loss",
  greater_is_better=False,

  save_safetensors=False
)

In [20]:
# Calculate model FLOPs based on input shape and gradient accumulation steps.
model_flops = (
  base_model.floating_point_ops(
    {
       "input_ids": torch.zeros(
           (1, max_length)
      )
    }
  )
  * training_args.gradient_accumulation_steps
)

print(base_model)
print("Memory footprint", base_model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
Memory footprint 0.510342192 GB
Flops 973.721088 GFLOPs


In [21]:
"""
Initialize the Trainer to manage the entire training process, including model training, evaluation,
and logging. It uses the provided model, training arguments, and tokenized datasets for training
and evaluation, handling tasks like optimization, checkpointing, and metric calculation.
"""
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

In [22]:
training_output = trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
120,2.9813,2.223344
240,2.0519,2.037769
360,1.6347,1.937185
480,1.9609,1.876956
600,2.1829,1.841003
720,1.9156,1.819551
840,1.6739,1.797953
960,1.9941,1.787369
1080,1.6391,1.774583
1200,1.5376,1.769493


In [23]:
# Save the model
save_dir = f'{output_dir}/final'

trainer.save_model(save_dir)
print("Saved model to:", save_dir)

Saved model to: lamini_docs_steps/final


# Evaluation with another LLM

In [24]:
# Transfer the model to evaluation mode
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True)
finetuned_slightly_model.to(device)
finetuned_slightly_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [25]:
# Prepare test data for evaluation
eval_dataset = [{"query": q, "answer": a} for q, a in zip(test_dataset["question"], test_dataset["answer"])]

In [26]:
# Prediction on evaluation dataset
prediction = []
for sample in eval_dataset:
    question = sample["query"]
    answer = sample["answer"]
    finetuned_model_answer = inference(question, finetuned_slightly_model, tokenizer)
    output = {"query": question, "answer": answer, "result": finetuned_model_answer}
    prediction.append(output)

In [27]:
# LLM model used for evaluation
llm = ChatGroq(model="llama3-groq-70b-8192-tool-use-preview", temperature=0)
prompt = "solve this math problem: {math_problem}"
math_problem = "1+1"
prompt_template = ChatPromptTemplate.from_template(prompt)
customer_message = prompt_template.format_messages(math_problem=math_problem)
llm.invoke(customer_message).content

'The answer to the math problem 1+1 is 2.'

In [28]:
actual = [
    {"query": "Sentence 1", "answer": "The dog is thirsty"},
    {"query": "Sentence 2", "answer": "The dog is hungry"},
]

pred = [
    {"query": "Sentence 1", "answer": "The dog is thirsty", "result": "The dog is thirsty"},
    {"query": "Sentence 2", "answer": "The dog is hungry", "result": "The dog is thirsty"},
]
# result = eval_chain.evaluate(actual, pred)

In [30]:
# Initialize evaluation chain and evaluate predictions
eval_chain = QAEvalChain.from_llm(llm)
result = eval_chain.evaluate(eval_dataset, prediction)

In [33]:
# Count the accuracy measure
eval_result = [1 if i["results"] == "GRADE: CORRECT" else 0 for i in result]
eval_result = (sum(eval_result) / len(eval_result)) * 100
print(f"Accuracy: {eval_result:.2f}%")

Accuracy: 23.57%


In [37]:
# Save the result of evaluation in a Datafraem
eval_df = pd.DataFrame.from_dict(prediction)
head_of_eval_df = eval_df.head()
style_df = head_of_eval_df.style.set_properties(**{'text-align': 'left'})
style_df = style_df.set_properties(**{"vertical-align": "text-top"})
style_df

Unnamed: 0,query,answer,result
0,### Question: Are there any tutorials on using Lamini for content generation in creative writing? ### Answer:,"Lamini can be used for any type of content generation, including creative writing. Try adapting one of our examples or walkthroughs to your use case. You can find these examples in our documentation.","Yes, there are tutorials available on using Lamini for content generation in creative writing. These include tutorials on using Lamini for content generation in creative writing, examples of using Lamini for content generation in creative writing, and examples of using Lamini for content generation in creative writing. Additionally, there are tutorials available on using Lamini for content generation in creative writing using examples of using Lamini for content generation in creative writing. Additionally, there are tutorials available on using Lamini for content generation in creative"
1,### Question: Can Lamini be used to perform sentiment analysis or opinion mining on large volumes of text data? ### Answer:,"Lamini can be used for sentiment analysis or opinion mining on large volumes of text data. To learn how, check out walkthroughs and examples available on Lamini’s website. With some imagination, you can adapt those examples to your data and use case.","Yes, Lamini can be used to perform sentiment analysis or opinion mining on large volumes of text data. It can be used to generate text that is representative of a specific sentiment or sentiment index, or to generate text that is representative of a specific sentiment or sentiment index. It can also be used to generate text that is representative of a specific sentiment or sentiment index, or to generate text that is representative of a specific sentiment or sentiment index. Additionally, Lamini can be used to generate text that is"
2,### Question: Do I have to pay for using Lamini? ### Answer:,"Everyone starts with 10,000 free credits, which is equivalent to about $100. After that, you can purchase more credits in the “API” tab at app.lamini.ai.","Yes, you can use Lamini for free to use your own data. You can use Lamini for free to use your own data. You can use Lamini for free to use your own data. You can use Lamini for free to use your own data. You can use Lamini for free to use your own data. You can use Lamini for free to use your own data. You can use Lamini for free to use your own data. You can use Lamini for free"
3,### Question: Can Lamini understand and generate text in multiple languages? ### Answer:,"Yes, Lamini can understand and generate text in multiple languages. It currently supports over 20 languages, including English, Spanish, French, German, Chinese, and Japanese.","Yes, Lamini can understand and generate text in multiple languages. It can generate text in multiple languages by using the language model's built-in language model engine, which can be used to generate text in multiple languages. Additionally, Lamini can generate text in multiple languages by using the language model's built-in language model engine, which can be used to generate text in multiple languages. Additionally, Lamini can generate text in multiple languages by using the language model's built-in language model engine"
4,### Question: Can Lamini talk to animals or understand what they're saying? ### Answer:,"While Lamini possesses extraordinary linguistic capabilities, it is crucial to note that its abilities do not extend to conversing with our animal counterparts or comprehending their communications. As an AI language model, Lamini's domain of expertise revolves around processing and generating text, responding to human inquiries and prompts with remarkable precision. While the enigmatic language of animals remains beyond its purview, Lamini's prowess in linguistic understanding and contextual interpretation continues to astound, forging new frontiers in human-machine interactions. While our fascination with bridging the gap between human and animal communication endures, Lamini's current capacities remain focused on enhancing our understanding of language and facilitating meaningful dialogue in the realms of human discourse.","Yes, Lamini can talk to animals or understand what they're saying. It can learn from their responses and use them to improve its language models. This can be useful for generating text that is more conversational or nuanced. Additionally, Lamini can learn from the animals' responses and use them to improve its language models. This can be useful for generating text that is more conversational or nuanced. Additionally, Lamini can learn from the animals' responses and use them to improve its language models."


In [38]:
# Save the Dataframe file
eval_df.to_csv("eval_df.csv")