In [1]:
# Choose whether to fine-tune the model in this run.
# The training cell below is wrapped in `if train:`, so leaving this as False
# skips model loading, LoRA setup and training entirely.
train = False

In [2]:
# Load the dataset
# `dataset` is indexed by split name in the conversion cell below (only the
# "train" split is used there).

from datasets import load_dataset
huggingface_dataset_name = "findnitai/english-to-hinglish"
dataset = load_dataset(huggingface_dataset_name)

In [3]:
# Explore the dataset

from datasets import load_dataset_builder
# Reuse the dataset id defined in the loading cell so both cells always
# inspect the same dataset.
ds_builder = load_dataset_builder(huggingface_dataset_name)

# Only the last expression of a cell is rendered automatically, so the
# description has to be printed explicitly or it is silently discarded.
print(ds_builder.info.description)

# Feature schema of the dataset (rendered as the cell output).
ds_builder.info.features

{'translation': {'en': Value(dtype='string', id=None),
  'hi_ng': Value(dtype='string', id=None),
  'source': Value(dtype='int64', id=None)}}

In [4]:
# Convert the dataset to format required by trl
from datasets import Dataset

def convert_dataset(row):
    """Convert one raw translation record into a prompt/completion pair.

    Args:
        row: Mapping holding the Hinglish sentence under ``'hi_ng'`` and the
            English sentence under ``'en'``. Other keys are ignored.

    Returns:
        dict with two keys: ``'prompt'`` (the Hinglish text, unchanged) and
        ``'completion'`` (the English text followed by a four-character end
        marker: ``<``, a backslash, ``s``, ``>``).
    """
    # "\\s" spells a literal backslash followed by "s". The previous "\s"
    # produced the same text but is an invalid escape sequence, which newer
    # Python versions report as a warning.
    return {"prompt": row['hi_ng'], "completion": row['en'] + "<\\s>"}

# Slice the rows first and pick the "translation" column afterwards, so only
# the requested records are read instead of the whole column.
# Both subsets are taken from the "train" split; the index ranges
# (0-9,999 and 100,000-104,999) do not overlap.
train_rows = dataset["train"][:10000]["translation"]
test_rows = dataset["train"][100000:105000]["translation"]

# Convert every raw record into the prompt/completion format.
train_dataset = [convert_dataset(row) for row in train_rows]
test_dataset = [convert_dataset(row) for row in test_rows]

# Show one converted example as a sanity check.
print(test_dataset[0])

# Wrap the converted records as `datasets.Dataset` objects for the trainer
# and the evaluation loop.
train_dataset_hg = Dataset.from_list(train_dataset)
test_dataset_hg = Dataset.from_list(test_dataset)

{'prompt': 'aur 15 minutes ke liye alarm ko snooze karen', 'completion': 'Snooze alarm for another 15 minutes<\\s>'}


In [5]:
# Load the model

if train:

    # Only the names this cell actually uses are imported (the previous list
    # imported AutoTokenizer twice and several names that were never used).
    import torch
    from transformers import (
        AutoModelForCausalLM,
        AutoTokenizer,
        BitsAndBytesConfig,
    )

    # model id
    model_id = "sarvamai/OpenHathi-7B-Hi-v0.1-Base"

    # BitsAndBytesConfig int-4 config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
    )

    # Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.padding_side = 'right' # to prevent warnings

    # Register the end marker that convert_dataset appends to every completion
    # ("<", backslash, "s", ">") as the tokenizer's EOS token, then grow the
    # embedding matrix to the new vocabulary size.
    # NOTE(review): presumably the base tokenizer already defines an EOS token;
    # confirm that introducing a new one (with freshly initialised embeddings)
    # is intended rather than reusing the existing token.
    special_tokens_dict = {"eos_token": "<\\s>"}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

    model.resize_token_embeddings(len(tokenizer))

    # Setup LoRa config
    from peft import LoraConfig

    # LoRA config based on QLoRA paper & Sebastian Raschka experiment
    peft_config = LoraConfig(
            lora_alpha=128,
            lora_dropout=0.05,
            r=256,
            bias="none",
            target_modules="all-linear",
            task_type="CAUSAL_LM",
    )

    # Set up training arguments
    from trl import SFTConfig

    max_seq_length = 50 # max sequence length for model and packing of the dataset
    args = SFTConfig(
        output_dir="sarvamai-hathi-highlish-to-english-translator-run-3", # directory to save and repository id
        num_train_epochs=2,                     # number of training epochs
        per_device_train_batch_size=100,          # batch size per device during training
        gradient_accumulation_steps=2,          # number of steps before performing a backward/update pass
        gradient_checkpointing=True,            # use gradient checkpointing to save memory
        optim="adamw_torch_fused",              # use fused adamw optimizer
        logging_steps=10,                       # log every 10 steps
        save_strategy="epoch",                  # save checkpoint every epoch
        learning_rate=2e-4,                     # learning rate, based on QLoRA paper
        bf16=True,                              # use bfloat16 precision
        tf32=True,                              # use tf32 precision
        max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
        warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
        lr_scheduler_type="constant",           # use constant learning rate scheduler
        push_to_hub=False,                      # keep the model local; do not push to the hub
        max_seq_length=max_seq_length,
        packing=True
    )

    # Set up SFT trainer

    from trl import SFTTrainer

    # NOTE(review): the recorded cell output shows a warning pointing at this
    # constructor call; newer TRL releases may expect a different keyword than
    # `tokenizer` -- confirm against the installed TRL version.
    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset_hg,
        peft_config=peft_config,
        tokenizer=tokenizer
    )

    # Train the model

    # start training, the model will be automatically saved to the output directory
    trainer.train()

    # save model
    trainer.save_model()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
  trainer = SFTTrainer(


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/10000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


KeyboardInterrupt: 

In [None]:
import torch
from peft import AutoPeftModelForCausalLM, PeftModel
from transformers import AutoTokenizer, pipeline, TranslationPipeline

# model id
# `base_model_id` is not read anywhere in this cell; it records which base
# model the adapter was trained on.
base_model_id = "sarvamai/OpenHathi-7B-Hi-v0.1-Base"
# Directory of the saved adapter checkpoint. The path sits under the
# `output_dir` configured in the training cell.
peft_model_id = "./sarvamai-hathi-highlish-to-english-translator-run-3/checkpoint-80/" #CHANGE IF FINETUNING YOUR OWN MODEL
# Alternative when training was run in this session:
# peft_model_id = args.output_dir

# Load Model with PEFT adapter
# NOTE(review): weights are loaded in float16 here while the training cell
# uses bfloat16 -- confirm the dtype difference is intended.
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  torch_dtype=torch.float16
)
# The tokenizer is read from the same checkpoint directory as the adapter.
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# load into pipeline
# pipe = TranslationPipeline(model=model, tokenizer=tokenizer)

# pipe = pipeline("translation", model=model, tokenizer=tokenizer)
# pipe.model = PeftModel.from_pretrained(model_id=peft_model_id, tokenizer=tokenizer)

In [None]:
# Wrap the loaded model and tokenizer in a transformers pipeline. The
# evaluation cell calls `pipe(...)` and reads the 'translation_text' field of
# its result.
# NOTE(review): the task name says English -> Hindi, while the evaluation
# prompt asks for a translation *to* English and the model was loaded through
# AutoPeftModelForCausalLM -- confirm this task is the intended one.
pipe = pipeline("translation_en_to_hi", model=model, tokenizer=tokenizer)

In [None]:
# text = "Mai March mai ghumne jaa raha hu. Tum bhi mere sath chalo."
# prompt = f"translate this text to english: {text}"
# output = pipe(prompt, max_new_tokens=50)

# print(output[0]['translation_text'][len(prompt):])

In [None]:
## Evaluate the model
## This is a very simple evaluation strategy

from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def similarity_func(sentence1, sentence2):
    """
    Calculates the TF-IDF cosine similarity score between two sentences.

    A fresh TfidfVectorizer is fitted on just the two sentences, so the score
    reflects overlap of the tokens the vectorizer extracts from them.

    Args:
      sentence1: The first sentence (string).
      sentence2: The second sentence (string).

    Returns:
      The cosine similarity score (float) between the two sentences, ranging from 0 to 1.
      Returns 0.0 when the vectorizer cannot extract a single token from
      either sentence (for example when both are empty).
    """
    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([sentence1, sentence2])
    except ValueError:
        # fit_transform raises ValueError on an empty vocabulary, i.e. when
        # neither sentence produces a token. Report "no measurable overlap"
        # instead of aborting the whole evaluation loop.
        return 0.0
    # cosine_similarity returns a 1x1 matrix here; unwrap it to a plain float.
    return float(cosine_similarity(vectors[0], vectors[1])[0][0])

def evaluate(sample):
    """Score the pipeline's translation of one sample against its reference.

    Args:
      sample: Mapping with the source text under 'prompt' and the reference
        text under 'completion'.

    Returns:
      The value of similarity_func(generated text, reference text).
    """
    request = f"translate this text to english: {sample['prompt']}"
    generations = pipe(request, max_new_tokens=50)
    full_text = generations[0]['translation_text']
    # Drop the first len(request) characters (the span the prompt is expected
    # to occupy in the returned text) and trim surrounding whitespace.
    candidate = full_text[len(request):].strip()
    return similarity_func(candidate, sample['completion'])

number_of_eval_samples = 10
eval_seed = 42  # fixed seed so the same evaluation samples are drawn on every run

# Draw a reproducible random subset of the held-out records.
eval_samples = test_dataset_hg.shuffle(seed=eval_seed).select(range(number_of_eval_samples))

# One similarity score per evaluated sample.
similarity_score = [evaluate(s) for s in tqdm(eval_samples)]

# The reported "accuracy" is the mean TF-IDF cosine similarity between the
# generated and reference sentences, not an exact-match rate.
accuracy = sum(similarity_score)/len(similarity_score)

print(f"Accuracy: {accuracy*100:.2f}%")