In [19]:
import pandas as pd
import numpy as np
import evaluate
import matplotlib as pt

# Third-party library imports
from datasets import Dataset, DatasetDict, load_dataset
from evaluate import load

In [None]:
# Slice of the CSV to use, written in `datasets` split syntax.
# Lower the percentage (e.g. "train[:20%]") for a quicker experiment.
percent_data_select = "train[:100%]"

# Read the cleaned English-Hindi WikiMatrix CSV as a single Dataset.
dataset = load_dataset(
    "csv",
    data_files={"train": "../Datasets/WikiMatrix/Processed/clean_en-hi.csv"},
    split=percent_data_select,
)

# First cut: 80% train / 20% held out (fixed seed for reproducibility).
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Second cut: halve the held-out part into validation and test (10% each).
validation_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)

# Gather the three partitions under their conventional names.
raw_dataset = dict(
    train=train_test_split["train"],
    validation=validation_test_split["train"],
    test=validation_test_split["test"],
)

# From here on `dataset` is the DatasetDict holding all three splits.
dataset = DatasetDict(raw_dataset)

# Show split names, columns and row counts.
print(dataset)


In [21]:
# Free memory before loading the model: run Python's garbage collector
# first, then empty the CUDA cache held by torch.
import gc

import torch

gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
)

# bitsandbytes settings for loading the base model in 4-bit precision.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights in 4-bit
    bnb_4bit_quant_type="nf4",             # 4-bit NormalFloat data type
    bnb_4bit_use_double_quant=True,        # enable double quantization
    bnb_4bit_compute_dtype=torch.float16,  # run the computation in FP16
)

# Base translation model, quantized while loading.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/m2m100_418M", quantization_config=quantization_config
)

# Matching tokenizer, configured for English source and Hindi target.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/m2m100_418M", src_lang="en", tgt_lang="hi"
)



In [23]:

def preprocess_function(examples, src_lang, tgt_lang, max_length=128):
    """Tokenize a batch of parallel sentences for seq2seq training.

    Args:
        examples: Batch mapping column names to lists of strings; must
            contain the columns named by ``src_lang`` and ``tgt_lang``.
        src_lang: Name of the source-text column (also used in the prompt
            prefix).
        tgt_lang: Name of the target-text column (also used in the prompt
            prefix).
        max_length: Length that both sources and targets are truncated and
            padded to. Defaults to 128, the value previously hard-coded.

    Returns:
        The tokenizer output for the prefixed source sentences, with an
        added ``"labels"`` entry holding the target token ids. Padding
        positions in the labels are set to -100.

    Relies on the module-level ``tokenizer``.
    """
    prefix = f"translate {src_lang} to {tgt_lang}: "
    inputs = [prefix + ex for ex in examples[src_lang]]
    targets = examples[tgt_lang]

    model_inputs = tokenizer(
        inputs, max_length=max_length, truncation=True, padding="max_length"
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager and tokenizes the sentences as targets.
    labels = tokenizer(
        text_target=targets,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )

    # Labels are padded with the pad token id here. Without masking, the loss
    # would also be computed on those padding positions; -100 is the index
    # the loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs


In [None]:
# Tokenize every split in batches; the column names are forwarded to
# preprocess_function as keyword arguments.
tokenized_datasets_english_to_hindi = dataset.map(
    preprocess_function,
    batched=True,
    fn_kwargs={"src_lang": "English", "tgt_lang": "Hindi"},
)

In [25]:
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training

# The base model was loaded in 4-bit, so prepare it for k-bit training
# before attaching adapters. This call was imported but previously missing.
model = prepare_model_for_kbit_training(model)

# LoRA hyper-parameters.
lora_config = LoraConfig(
    r=16,                                 # rank of the low-rank matrices
    lora_alpha=32,                        # scaling factor
    target_modules=["q_proj", "v_proj"],  # modules that receive adapters
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,      # sequence-to-sequence LM task
)

# Wrap the prepared model with the LoRA adapters.
# NOTE: `model` is rebound here, so re-running this cell without reloading
# the base model would wrap an already-wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
# Report the trainable-parameter summary of the LoRA-wrapped model.
model.print_trainable_parameters()

In [27]:
# Batch collator for seq2seq training, built from the tokenizer and the model;
# it is passed to the trainer as `data_collator`.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Training configuration for the LoRA fine-tune.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    # NOTE(review): newer transformers releases name this `eval_strategy`;
    # rename it if the installed version rejects `evaluation_strategy`.
    evaluation_strategy="epoch",  # evaluate after each epoch
    save_strategy="epoch",        # save after each epoch (must match evaluation)
    num_train_epochs=10,
    learning_rate=2e-5,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.1,
    gradient_accumulation_steps=4,  # effective batch of 8 * 4 = 32 per device
    fp16=True,
    logging_steps=10,
    lr_scheduler_type="linear",  # linear decay after warmup
    # `metric_for_best_model` only takes effect together with
    # `load_best_model_at_end`; without it the best checkpoint is never
    # restored and the final-epoch weights are what gets saved.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower loss is better
    predict_with_generate=True,
    # The string "none" disables all logging integrations; Python `None`
    # does not. Use "wandb" to log to Weights & Biases.
    report_to="none",
)

# Trainer wired to the tokenized train and validation splits.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets_english_to_hindi["train"],
    eval_dataset=tokenized_datasets_english_to_hindi["validation"],
)

trainer.train()

In [None]:
# Write the trained model and its tokenizer to the same directory so they
# can be loaded together later.
lora_output_dir = "../Model/lora/M2M100/"

trainer.save_model(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)


In [None]:
from transformers import pipeline

# Training inputs were built as "translate English to Hindi: <sentence>"
# (see preprocess_function), so inference uses the same prefix to match
# what the fine-tuned model saw.
text = "translate English to Hindi: " + 'break a leg'

# Load the saved model directory into an English-to-Hindi translation pipeline.
translator = pipeline("translation_en_to_hi", model="../Model/lora/M2M100/")
translator(text)



In [None]:
# Upload `model` to the Hub repository "aktheroy/translate_en_hi".
# NOTE(review): `model` is the object returned by get_peft_model, so this
# presumably uploads only the LoRA adapter rather than full weights, and
# presumably needs an authenticated Hub session — confirm before relying on it.
model.push_to_hub("aktheroy/translate_en_hi")