In [2]:
from datasets import Dataset, DatasetDict
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_path="base_models/granite-3.2-2b-instruct"
device= "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_path
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
tokenizer("input", "input", "input", "inputs", return_tensors="pt")

NameError: name 'tokenizer' is not defined

In [3]:
from datasets import Dataset, DatasetDict

with open("data/coco.ml.txt") as f:
    ml = f.readlines()

with open("data/coco.en.txt") as f:
    eng = f.readlines()

def get_dataset(ml, eng):
    ml = [sen.strip() for sen in ml]
    eng = [sen.strip() for sen in eng]
    return [{"ml": ml, "eng": eng, "content": f'Translate to english:<|end_of_text|>{ml}<|end_of_text|>{eng}<|end_of_text|>'} for ml, eng in zip(ml, eng)]

n = 100
dataset = get_dataset(ml, eng)
train_dataset = Dataset.from_list(dataset[:n // 10 * 8])
valid_dataset = Dataset.from_list(dataset[n // 10 * 8:n])
dataset = DatasetDict({"train": train_dataset, "validation": valid_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['ml', 'eng', 'content'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['ml', 'eng', 'content'],
        num_rows: 20
    })
})

In [4]:
from transformers import Trainer, TrainingArguments
args = TrainingArguments("ml_to_en")

def tokenize_function(examples):
    tokenized_inputs = tokenizer(examples["content"], padding=True, truncation=True, return_tensors="pt")
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].clone()
    return tokenized_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ml', 'eng', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 80
    })
    validation: Dataset({
        features: ['ml', 'eng', 'content', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 20
    })
})

In [5]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

trainer.train()


KeyboardInterrupt



In [6]:
trainer.predict(tokenized_datasets['validation'])[0]

Step,Training Loss


array([[[ -5.53125   , -15.375     , -15.375     , ...,  -6.40625   ,
          -5.34375   ,  -6.75      ],
        [  5.3125    ,  -3.59375   ,  -3.59375   , ...,  -0.39453125,
          -0.2578125 ,  -0.6953125 ],
        [  9.125     ,  -5.15625   ,  -5.15625   , ...,   0.16210938,
           0.87890625,  -0.90234375],
        ...,
        [ 18.        ,  -4.6875    ,  -4.6875    , ...,  -0.08984375,
           0.63671875,  -0.39453125],
        [ 16.875     ,  -5.6875    ,  -5.6875    , ...,  -0.54296875,
          -0.05981445,  -0.875     ],
        [ 16.375     ,  -4.8125    ,  -4.8125    , ...,  -0.37890625,
          -0.03112793,  -0.60546875]],

       [[ -5.53125   , -15.375     , -15.375     , ...,  -6.40625   ,
          -5.34375   ,  -6.75      ],
        [  5.3125    ,  -3.59375   ,  -3.59375   , ...,  -0.39453125,
          -0.2578125 ,  -0.6953125 ],
        [  9.125     ,  -5.15625   ,  -5.15625   , ...,   0.16210938,
           0.87890625,  -0.90234375],
        ...,


In [8]:
pred = trainer.predict(tokenized_datasets['validation'])

In [30]:
type(pred.predictions)

numpy.ndarray

In [35]:
from nltk import bleu_score
import numpy as np

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    bleu = bleu_score.corpus_bleu(label_str, pred_str, smoothing_function=bleu_score.SmoothingFunction().method7)
    return {"bleu": bleu}

compute_metrics(pred)

(20, 144) (20, 144)


{'bleu': 0.09299390537329787}