### Installation and Imports

In [None]:
# Installation (try the latest versions if you get stuck anywhere)
!pip install -q accelerate==0.34.2
!pip install -q peft==0.14.0
!pip install -q bitsandbytes==0.45.0
!pip install -q transformers==4.47.1
!pip install -q trl==0.13.0
#!pip install -q fastrlock==0.8.2

In [None]:
# Imports
import json
import logging
import os

import pandas as pd
import torch
from datasets import load_dataset, concatenate_datasets, Dataset
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    EarlyStoppingCallback)
from trl import SFTTrainer

- Set up your Hugging Face account ([link](https://huggingface.co/))
- Get a Hugging Face access token by navigating to `your profile -> Access Tokens -> +Create new token`

In [None]:
# Set environment variables (use Colab secrets or a shell export for the real
# token; never commit a real token to the notebook).
# setdefault keeps an HF_TOKEN that is already configured instead of
# overwriting it with the placeholder below.
os.environ.setdefault('HF_TOKEN', '<your-huggingface-token>')
# Turn off Weights & Biases reporting for this run.
os.environ["WANDB_DISABLED"] = "true"

### Dataset Creation/Curation

In [None]:
# Alpaca-style prompt template used to lay out every dataset record.
# The three `{}` placeholders are filled, in order, with the instruction,
# the input and the response (see formatting_prompts_func).
# The template begins with a newline because the text starts on the line
# after the opening triple quote.
alpaca_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
# Download the Gemma tokenizer and read its end-of-sequence (EOS) token.
gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
EOS_TOKEN = gemma_tokenizer.eos_token # Appended to every formatted training example
# Show the token as the cell output.
EOS_TOKEN

In [None]:
# dataset formatting function
def formatting_prompts_func(examples):
    """Render a batch of records into Alpaca-style training strings.

    Args:
        examples: Batch mapping with "instruction", "input" and "output"
            columns, each a sequence of equal length.

    Returns:
        A dict with a single "text" key holding one formatted string per
        record, each terminated with ``EOS_TOKEN``.
    """
    columns = (examples["instruction"], examples["input"], examples["output"])
    rendered = []
    for task, context, answer in zip(*columns):
        # A missing input is rendered as an empty "Input" section.
        context = "" if context is None else context
        # The EOS token tells the model where a response ends; without it
        # generation would not learn to stop.
        rendered.append(alpaca_prompt.format(task, context, answer) + EOS_TOKEN)
    return {"text": rendered}

In [None]:
# dataset download and preprocess function
def process_dataset(dataset_name,
                    split_type,
                    processing_func,
                    rename_column = False,
                    filter_data = False,
                    filter_column_value = 'id',
                    filter_value = 'alpaca',
                    num_samples=20000):
    """Load a dataset, optionally rename/filter it, then apply ``processing_func``.

    Args:
        dataset_name: A string is passed to ``load_dataset`` together with
            ``split_type``. Any other value (e.g. a path-like object) is
            opened as a file and parsed as one JSON object per line.
        split_type: Split name forwarded to ``load_dataset``.
        processing_func: Batched mapping function applied with
            ``dataset.map(..., batched=True)``.
        rename_column: When true, the dataset is passed through ``rename``.
        filter_data: When true, the dataset is passed through
            ``filter_dataset``.
        filter_column_value: Name of the column inspected by the filter.
        filter_value: Value that must be contained in that column.
        num_samples: Number of rows requested from the filtered dataset.

    Returns:
        The processed dataset.
    """
    if isinstance(dataset_name, str):
        dataset = load_dataset(dataset_name, split=split_type)
    else:
        # Non-string names are treated as the path of a JSON-lines file.
        # UTF-8 is requested explicitly so non-ASCII text is decoded the same
        # way on every platform.
        data = []
        with open(dataset_name, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, 1):
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    # Malformed lines are reported and skipped.
                    print(f"Error parsing JSON at line {line_number}: {e}")
        dataset = Dataset.from_pandas(pd.DataFrame(data))

    if rename_column:
        dataset = rename(dataset)

    if filter_data:
        # Forward this function's own filter parameters; the previous code
        # referenced undefined names here and raised NameError.
        dataset = filter_dataset(dataset, num_samples, filter_value, filter_column_value)

    dataset = dataset.map(processing_func, batched=True)

    return dataset

# Define the additional processing steps
def rename(dataset):
    """Return ``dataset`` with its 'response' column renamed to 'output'."""
    source_column, target_column = 'response', 'output'
    renamed = dataset.rename_column(source_column, target_column)
    return renamed

def filter_dataset(dataset, num_samples, value, column_name):
    """Keep rows whose ``column_name`` contains ``value`` and sample them.

    Args:
        dataset: Dataset exposing ``filter``, ``shuffle`` and ``select``.
        num_samples: Maximum number of rows to keep.
        value: Value that must be contained in ``example[column_name]``.
        column_name: Column inspected for ``value``.

    Returns:
        The matching rows, shuffled with a fixed seed (42), truncated to at
        most ``num_samples`` rows.
    """
    shuffled = dataset.filter(lambda example: value in example[column_name]).shuffle(seed=42)
    # Cap at the number of matching rows so that asking for more samples than
    # exist returns everything instead of raising an index error.
    keep = min(num_samples, len(shuffled))
    return shuffled.select(range(keep))


In [None]:
# Hugging Face Hub id of the dataset used for fine-tuning.
dataset_id = "HydraIndicLM/punjabi_alpaca_52K"
# Load the "train" split and add the formatted "text" column.
dataset = process_dataset(dataset_id, "train", formatting_prompts_func)
# Show the dataset summary as the cell output.
dataset

In [None]:
# Export the formatted dataset to a CSV file for inspection.
df = dataset.to_pandas()
df.to_csv("dataset.csv") # save dataset in a csv file (written to the working directory)

In [None]:
# tokenize the dataset
def preprocess_function(examples):
    """Tokenize the "text" column, truncating/padding each row to 512 tokens.

    Args:
        examples: Batch mapping containing a "text" column.

    Returns:
        Whatever ``gemma_tokenizer`` returns for the batch.
    """
    tokenizer_options = dict(
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    return gemma_tokenizer(examples["text"], **tokenizer_options)

# Tokenize the whole dataset in batches; this adds the "input_ids" and
# "attention_mask" columns shown in the output below.
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'instruction', 'output', 'text', 'input_ids', 'attention_mask'],
    num_rows: 52002
})

In [None]:
# Split the dataset into train, validation and test sets using contiguous
# index ranges over the 52,002 rows: 44,000 train / 3,000 validation /
# 5,002 test. The rows are not shuffled before splitting.
train_dataset = encoded_dataset.select(range(44000))
val_dataset = encoded_dataset.select(range(44000, 47000))
test_dataset = encoded_dataset.select(range(47000, 52002))

In [None]:
# Keep only the "input_ids" column in each split.
# select_columns drops the other columns without first materialising every
# token id as a Python list, which Dataset.from_dict(...) on the full column
# required.
train_dataset = train_dataset.select_columns(["input_ids"])
val_dataset = val_dataset.select_columns(["input_ids"])
test_dataset = test_dataset.select_columns(["input_ids"])

### Model fine-tuning

In [None]:
# Download the gemma-2-2b base model, loaded in bfloat16 with automatic
# device placement (device_map="auto").
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b",
    device_map="auto",
    torch_dtype=torch.bfloat16
)

In [None]:
# LoRA config for fine-tuning; handed to SFTTrainer through `peft_config`.
# NOTE(review): no task_type or target_modules are set, so the library
# defaults apply -- confirm they are the intended ones for Gemma.
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none"
)

In [None]:
# Training parameters.
# Evaluation, checkpointing and logging all happen every 500 steps, so that
# load_best_model_at_end can pick the checkpoint with the best eval_loss.
# `eval_strategy` is used instead of the deprecated `evaluation_strategy`
# argument name.
train_params = TrainingArguments(
    output_dir="./output_dir",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1, #validation
    eval_strategy="steps", #validation
    save_strategy="steps", #validation
    metric_for_best_model="eval_loss", #validation
    load_best_model_at_end=True, #validation
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    save_steps=500,
    eval_steps=500, #validation
    logging_steps=500,
    logging_dir=None,
    logging_strategy="steps",
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=1.0,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    save_total_limit=10
)

In [None]:
# Trainer that fine-tunes the base model with the LoRA config above.
# The validation split is used for periodic evaluation, and training stops
# early after 3 evaluations without improvement (early_stopping_patience=3).
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_parameters,
    args=train_params,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# Helpful in capturing the validation logs.
# NOTE(review): presumably the trainer does not detect on its own that the
# wrapped model returns a loss, so eval_loss would otherwise be missing --
# confirm this override is still required.
trainer.can_return_loss = True

In [None]:
# To continue from checkpoints that were already saved in output_dir,
# use the commented-out call instead of the one below:
#trainer.train(resume_from_checkpoint = True)

trainer.train() #start fine-tuning from scratch

In [None]:
# Show the training log history recorded by the trainer (cell output).
trainer.state.log_history

In [None]:
# Evaluate the fine-tuned model on the held-out test split and print the
# returned metrics.
eval_results = trainer.evaluate(test_dataset)
print("Test results:", eval_results)

In [None]:
# Save the fine-tuned weights locally in the "gemma-2-2b-punjabi-finetuned-4"
# directory.
trainer.save_model("gemma-2-2b-punjabi-finetuned-4")

In [None]:
# Push the fine-tuned weights (from the trainer) to the Hugging Face Hub.
# Replace the repository id with one you have write access to.
trainer.push_to_hub("amanpreetsingh459/gemma-2-2b-punjabi-finetuned-4")

In [None]:
# Save the base model locally in the same directory as the fine-tuned weights.
# NOTE(review): this same model object was handed to SFTTrainer together with
# a peft_config, so it may already carry the LoRA layers at this point --
# confirm that saving it into the adapter directory is intended.
base_model.save_pretrained("gemma-2-2b-punjabi-finetuned-4")

In [None]:
# Push the base model to the same Hugging Face Hub repository as the
# fine-tuned weights.
base_model.push_to_hub("amanpreetsingh459/gemma-2-2b-punjabi-finetuned-4")

In [None]:
# Push the tokenizer to the same Hub repository, so the inference section can
# load model and tokenizer from one place.
gemma_tokenizer.push_to_hub("amanpreetsingh459/gemma-2-2b-punjabi-finetuned-4")

### Inference

> *Use all the imports from the 'Installation and Imports' section above*

In [None]:
# Hugging Face Hub repository to load the fine-tuned model from.
# You can also download the model and point this at the local directory.
finetuned_model_name = "amanpreetsingh459/gemma-2-2b-punjabi-finetuned-4"

In [None]:
# Load the tokenizer that was pushed alongside the fine-tuned model and read
# its end-of-sequence token.
gemma_tokenizer = AutoTokenizer.from_pretrained(finetuned_model_name)
EOS_TOKEN = gemma_tokenizer.eos_token

In [None]:
# Load the fine-tuned model in bfloat16 with automatic device placement.
model_finetuned = AutoModelForCausalLM.from_pretrained(
    finetuned_model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

In [None]:
# Same Alpaca-style template that was used to format the training data; it is
# repeated here so the inference section can run on its own. The three `{}`
# placeholders are instruction, input and response, in that order.
alpaca_prompt = """
### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
# Put your instruction here; it is inserted into the "Instruction" section of
# the prompt to generate the response.
instruction = "ਮੇਨੂ ਏਕ ਕਵਿਤਾ ਲਿੱਖ ਕੇ ਦੇਯੋ ਜੀ"

In [None]:
# Build the prompt (empty input, empty response) and tokenize it.
# The tensors are moved to the device the model was placed on instead of a
# hard-coded "cuda", so the cell also works when device_map="auto" chose a
# different device (e.g. CPU-only machines).
inputs = gemma_tokenizer(
[
    alpaca_prompt.format(
        instruction, # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to(model_finetuned.device)

In [None]:
# Generate up to 250 new tokens as the continuation of the prompt.
outputs = model_finetuned.generate(**inputs, max_new_tokens = 250)

In [None]:
# Decode the generated token ids back to text and print the first (only)
# sequence; the printed text includes the prompt and special tokens.
decoded_outputs = gemma_tokenizer.batch_decode(outputs)
print(decoded_outputs[0])

<bos>
### Instruction:
ਮੇਨੂ ਏਕ ਕਵਿਤਾ ਲਿੱਖ ਕੇ ਦੇਯੋ ਜੀ

### Input:


### Response:
ਮੇਨੂ ਏਕ ਕਵਿਤਾ ਲਿਖਣਾ ਇੱਕ ਸ਼ਾਨਦਾਰ ਅਨੁਭਵ ਹੈ। ਇਹ ਸ਼ਾਨਦਾਰ ਸੰਗੀਤ ਅਤੇ ਸ਼ਬਦਾਂ ਦੀ ਵਰਤੋਂ ਕਰਕੇ ਇੱਕ ਸ਼ਾਨਦਾਰ ਕਵਿਤਾ ਬਣਾਉਣਾ ਹੈ। ਇਹ ਇੱਕ ਸ਼ਾਨਦਾਰ ਸੰਗੀਤ ਸੰਗੀਤ ਅਤੇ ਸ਼ਬਦਾਂ ਦੀ ਵਰਤੋਂ ਕਰਕੇ ਇੱਕ ਸ਼ਾਨਦਾਰ ਕਵਿਤਾ ਬਣਾਉਣਾ ਹੈ। ਇਹ ਇੱਕ ਸ਼ਾਨਦਾਰ ਸੰਗੀਤ ਸੰਗੀਤ ਅਤੇ ਸ਼ਬਦਾਂ ਦੀ ਵਰਤੋਂ ਕਰਕੇ ਇੱਕ ਸ਼ਾਨਦਾਰ ਕਵਿਤਾ ਬਣਾਉਣਾ ਹੈ। ਇਹ ਇੱਕ ਸ਼ਾਨਦਾਰ ਸੰਗੀਤ ਸੰਗੀਤ ਅਤੇ ਸ਼ਬਦਾਂ ਦੀ ਵਰਤੋਂ ਕਰਕੇ ਇੱਕ ਸ਼ਾਨਦਾਰ ਕਵਿਤਾ ਬਣਾਉਣਾ ਹੈ। ਇਹ ਇੱਕ ਸ਼ਾਨਦਾਰ ਸੰਗੀਤ ਸੰਗੀਤ


### Evaluation metrics

In [None]:
#Required installations
!pip install --upgrade -q nltk
!pip install -q evaluate
!pip install -q rouge_score
!pip install -q bert_score
!pip install -q mauve-text

In [None]:
#imports
from evaluate import load
import nltk
import os, torch
import pandas as pd

In [None]:
# Download the NLTK resources (tokenizer data and WordNet) if you haven't
# already.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

> *The predictions were generated with the fine-tuned model and are stored in the file available at [this github link](https://github.com/amanpreetsingh459/kaggle_challenges/blob/master/google-gemma-2-language-fine-tuning/predictions_test_data.csv)*

In [None]:
# Load the saved predictions file and pull out the generated texts and their
# reference texts as plain Python lists.
saved_preds_df = pd.read_csv("predictions_test_data.csv")
predictions=list(saved_preds_df['predictions'])
references=list(saved_preds_df['references'])

In [None]:
# BLEU between the generated predictions and the reference responses.
bleu = load("bleu")
bleu_results = bleu.compute(predictions=predictions, references=references)
print(f"BLEU Score: {bleu_results['bleu']}")

## Output
#BLEU Score: 0.03157486421394274

In [None]:
# ROUGE between the generated predictions and the reference responses.
rouge = load("rouge")
# The default rouge_score tokenizer keeps only [a-z0-9] characters, which
# discards Gurmukhi (Punjabi) text entirely. Splitting on whitespace makes the
# metric compare the actual Punjabi words.
rouge_results = rouge.compute(
    predictions=predictions,
    references=references,
    tokenizer=lambda text: text.split(),
)
print(f"ROUGE Score: {rouge_results}")  # This will print different ROUGE variants

## Output (recorded with the default tokenizer, i.e. before whitespace tokenization was used)
#ROUGE Score: {'rouge1': 0.038499762388349804, 'rouge2': 0.013019296072536182, 'rougeL': 0.03762116227346678, 'rougeLsum': 0.037291436269988584}

In [None]:
# Self-BLEU (This requires a bit more complex logic to compare generated texts to each other)
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def self_bleu(texts):
    """Calculates the average self-BLEU score of each generated text against all other generated texts.

    Args:
        texts: Sequence of generated strings; each is tokenized by splitting
            on whitespace.

    Returns:
        The mean sentence-level BLEU of every text scored against all the
        other texts as references, or 0.0 when ``texts`` is empty.
    """
    if not texts:
        # No texts means nothing to average; avoid dividing by zero.
        return 0.0

    # Tokenize once up front instead of re-splitting every text on each
    # iteration of the loop below.
    tokenized = [text.split() for text in texts]
    smoothing = SmoothingFunction().method1

    scores = []
    for i, hypothesis in enumerate(tokenized):
        # Every text except the current one acts as a reference.
        others = tokenized[:i] + tokenized[i + 1:]
        scores.append(sentence_bleu(others, hypothesis, smoothing_function=smoothing))
    return sum(scores) / len(scores)

# Score how similar the generated predictions are to one another and print
# the result.
self_bleu_score = self_bleu(predictions)
print(f"Self-BLEU Score: {self_bleu_score}")

## Output
#Self-BLEU Score: 0.4602623690010304

### Citations

```bibtex
@article{gemma_2024,
    title={Gemma},
    url={https://www.kaggle.com/m/3301},
    DOI={10.34740/KAGGLE/M/3301},
    publisher={Kaggle},
    author={Gemma Team},
    year={2024}
}

@misc{gemma-language-tuning,
    author = {Glenn Cameron and Lauren Usui and Paul Mooney and Addison Howard},
    title = {Google - Unlock Global Communication with Gemma},
    year = {2024},
    howpublished = {\url{https://kaggle.com/competitions/gemma-language-tuning}},
    note = {Kaggle}
}

@misc{vonwerra2022trl,
	title        = {{TRL: Transformer Reinforcement Learning}},
	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
	year         = 2020,
	journal      = {GitHub repository},
	publisher    = {GitHub},
	howpublished = {\url{https://github.com/huggingface/trl}}
}
```