In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from pandas import DataFrame
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_path="base_models/granite-3.2-2b-instruct"
device= "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
tokenizer("input", "input", "input", "inputs", return_tensors="pt")

NameError: name 'tokenizer' is not defined

In [103]:
from datasets import Dataset, DatasetDict

with open("data/coco.ml.txt") as f:
    ml = f.readlines()

with open("data/coco.en.txt") as f:
    eng = f.readlines()

def get_dataset(ml, eng):
    ml = [sen.strip() for sen in ml]
    eng = [sen.strip() for sen in eng]
    return [{"ml": ml, "eng": eng, "content": f'Translate to english:<|end_of_text|>{ml}<|end_of_text|>{eng}<|end_of_text|>'} for ml, eng in zip(ml, eng)]

dataset = get_dataset(ml, eng)
n = len(dataset)
train_dataset = Dataset.from_list(dataset[:n // 10 * 8])
valid_dataset = Dataset.from_list(dataset[n // 10 * 8:n])
dataset = DatasetDict({"train": train_dataset, "validation": valid_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['ml', 'eng', 'content'],
        num_rows: 289464
    })
    validation: Dataset({
        features: ['ml', 'eng', 'content'],
        num_rows: 72368
    })
})

In [4]:
from transformers import Trainer, TrainingArguments
args = TrainingArguments("ml_to_en")

def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["content"], padding=True, truncation=True, return_tensors="pt")
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ml', 'eng', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['ml', 'eng', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
})

In [49]:
len(tokenized_datasets['train'][0]['input_ids'])

158

In [5]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

trainer.train()


KeyboardInterrupt



In [6]:
trainer.predict(tokenized_datasets['validation'])[0]

Step,Training Loss


array([[[ -5.53125   , -15.375     , -15.375     , ...,  -6.40625   ,
          -5.34375   ,  -6.75      ],
        [  5.3125    ,  -3.59375   ,  -3.59375   , ...,  -0.39453125,
          -0.2578125 ,  -0.6953125 ],
        [  9.125     ,  -5.15625   ,  -5.15625   , ...,   0.16210938,
           0.87890625,  -0.90234375],
        ...,
        [ 18.        ,  -4.6875    ,  -4.6875    , ...,  -0.08984375,
           0.63671875,  -0.39453125],
        [ 16.875     ,  -5.6875    ,  -5.6875    , ...,  -0.54296875,
          -0.05981445,  -0.875     ],
        [ 16.375     ,  -4.8125    ,  -4.8125    , ...,  -0.37890625,
          -0.03112793,  -0.60546875]],

       [[ -5.53125   , -15.375     , -15.375     , ...,  -6.40625   ,
          -5.34375   ,  -6.75      ],
        [  5.3125    ,  -3.59375   ,  -3.59375   , ...,  -0.39453125,
          -0.2578125 ,  -0.6953125 ],
        [  9.125     ,  -5.15625   ,  -5.15625   , ...,   0.16210938,
           0.87890625,  -0.90234375],
        ...,


In [8]:
pred = trainer.predict(tokenized_datasets['validation'])

In [30]:
type(pred.predictions)

numpy.ndarray

In [35]:
from nltk import bleu_score
import numpy as np

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    bleu = bleu_score.corpus_bleu(label_str, pred_str, smoothing_function=bleu_score.SmoothingFunction().method7)
    return {"bleu": bleu}

compute_metrics(pred)

(20, 144) (20, 144)


{'bleu': 0.09299390537329787}

In [53]:
ml_test = 'എയർപോർട്ട് ടാർമാക്കിന്റെ മുകളിൽ ഇരിക്കുന്ന ഒരു വലിയ വെളുത്ത ജെറ്റ്'
ip = tokenizer(f'Translate to english:<|end_of_text|>{ml_test}<|end_of_text|>', return_tensors="pt").to(device)
op = model.generate(**ip, max_length=200)
tokenizer.decode(op[0])

'Translate to english:<|end_of_text|>എയർപോർട്ട് ടാർമാക്കിന്റെ മുകളിൽ ഇരിക്കുന്ന ഒരു വലിയ വെളുത്ത ജെറ്റ്<|end_of_text|>\n\nA large jet engine in the rear of the Eyroport Tramack.<|end_of_text|>'

In [75]:
model_path_ours = "base_models/granite-3.2-2b-instruct"

model_ours = AutoModelForCausalLM.from_pretrained(
    model_path_ours,
    device_map=device,
    torch_dtype=torch.bfloat16,
)
tokenizer_ours = AutoTokenizer.from_pretrained(
    model_path_ours,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
num_test = 2

ml_test = dataset['validation']['ml'][:num_test]
eng_test = dataset['validation']['eng'][:num_test]

ip_content = [f'Translate to english:<|end_of_text|>{test}<|end_of_text|>' for test in ml_test]
ip = tokenizer_ours(ip_content, return_tensors="pt", padding=True).to(device)
ip_base = tokenizer(ip_content, return_tensors="pt", padding=True).to(device)

ip_length = ip['input_ids'].shape[-1]

op = model_ours.generate(**ip, max_length=200)
op_base = model.generate(**ip_base, max_length=200)

In [98]:
.shape

torch.Size([2, 18])

In [92]:
cop = op[:, ip['input_ids'].shape[-1]:]

(tensor([16076,   372, 40230,    44,     0,  8747,   232,  8747,   116, 39770,
          8747,   133,  8747,   126, 41125,  8747,   127,  8747,   125,  8747,
           141,  8747,   126, 39770, 27251,   116, 13525,   238, 49057, 21185,
         49057, 45556, 49057, 21185, 49057, 27251,   245,  8747,   127, 45556,
         27251,   124,  8747,   141,  8747,   136, 21185,  8747,   126, 27251,
           135, 21185,  8747,   117,  8747,   129,  8747,   116, 21185,  8747,
           116, 39770, 49057, 21185,  8747,   128, 13525,   233, 27251,   253,
         39770,  8747,   116, 21185,  8747,   127,  8747,   229,    32,     0,
           203,    51,   343,  4325,   689,   409,  1295,   382,   432,   312,
         28805,    30,   408,   837,    86,  5122,    32,     0],
        device='mps:0'),
 tensor([16076,   372, 40230,    44,     0,  8747,   232,  8747,   116, 39770,
          8747,   133,  8747,   126, 41125,  8747,   127,  8747,   125,  8747,
           141,  8747,   126, 39770, 272

In [100]:
cop = op[:, ip['input_ids'].shape[-1] + 1:]
cop_base = op_base[:, ip['input_ids'].shape[-1] + 1:]

# get only generated part
op_str = tokenizer_ours.batch_decode(cop, skip_special_tokens=True)
op_base_str = tokenizer.batch_decode(cop_base, skip_special_tokens=True)

bleu = [bleu_score.sentence_bleu([eng], op, smoothing_function=bleu_score.SmoothingFunction().method7) * 100 for eng, op in zip(eng_test, op_str)]
bleu_base = [bleu_score.sentence_bleu([eng], op, smoothing_function=bleu_score.SmoothingFunction().method7) * 100 for eng, op in zip(eng_test, op_base_str)]

import pandas as pd
df = pd.DataFrame({"ml": ml_test, "eng": eng_test, "op": op_str, "op_base": op_base_str, "bleu": bleu, "bleu_base": bleu_base})
df.to_csv("base_finetune_mt.csv")

Unnamed: 0,ml,eng,op,op_base,bleu,bleu_base
0,ഒരു നഗരത്തിലെ പാർക്കിംഗിനായി ഒരു കുട്ടി പണം നൽകുന്നു.,A child pays for parking in a city.,A girl in a city earns money by selling her body.,A girl in a city earns money by selling her body.,38.737708,38.737708
1,അതിശയകരമായി തോന്നുന്ന ഒരു ബാഹ്യ സ്ഥലത്തിന്റെ ചിത്രം.,Picture of an exterior place that looks wonderful.,"A dramatic depiction of a quiet, secluded space.","A dramatic depiction of a quiet, secluded space.",27.195199,27.195199


In [104]:
train_df = pd.DataFrame(dataset['train'])
print(len(train_df))
# deduplication
train_df = train_df.drop_duplicates()
print(len(train_df))

289464
270880


In [105]:
val_df = pd.DataFrame(dataset['validation'])
print(len(val_df))
# deduplication
val_df = val_df.drop_duplicates()
print(len(val_df))

72368
71043


In [106]:
# combined
train2_df = pd.DataFrame(dataset['train'])
val2_df = pd.DataFrame(dataset['validation'])
full_df = pd.concat([train2_df, val2_df])
print(len(full_df))
# deduplication
full_df = full_df.drop_duplicates()
print(len(full_df))

361832
341149


In [109]:
361832 - 341149 - (289464 - 270880) - (72368 - 71043)

774