In [16]:
from dotenv import load_dotenv
import numpy as np
import torch
from typing import Tuple, List
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from datasets import load_dataset, DatasetDict
import evaluate
load_dotenv()

True

# Finetuning for Classification

In [None]:
# Get the data ready
raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer)

In [70]:
# Get the model (and trainer) ready
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=1
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"].select(range(2_000)),
    eval_dataset=tokenized_datasets["validation"].select(range(100)),
    data_collator=data_collator,
    tokenizer=tokenizer
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Evaluate the outputs before training
predictions = trainer.predict(tokenized_datasets["test"].select(range(50)))
print("Predictions shapes: ", predictions.predictions.shape, predictions.label_ids.shape)

# Load the accuracy metric
accuracy_metric = evaluate.load("accuracy")

# Convert logits to predicted class
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Compute accuracy
accuracy = accuracy_metric.compute(predictions=preds, references=predictions.label_ids)

print(f"Accuracy (before training): {accuracy['accuracy']}")

In [71]:
# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=250, training_loss=0.6276907348632812, metrics={'train_runtime': 881.0138, 'train_samples_per_second': 2.27, 'train_steps_per_second': 0.284, 'total_flos': 76474872684480.0, 'train_loss': 0.6276907348632812, 'epoch': 1.0})

In [69]:
# Evaluate the outputs
predictions = trainer.predict(tokenized_datasets["test"].select(range(50)))
print("Predictions shapes: ", predictions.predictions.shape, predictions.label_ids.shape)

# Load the accuracy metric
accuracy_metric = evaluate.load("accuracy")

# Convert logits to predicted class
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Compute accuracy
accuracy = accuracy_metric.compute(predictions=preds, references=predictions.label_ids)

print(f"Accuracy (after training): {accuracy['accuracy']}")

Predictions shapes:  (50, 2) (50,)
Accuracy: 0.68


In [56]:
# Inference mode
def inference(model, sentences: List[Tuple]):
    sentences = [sentences] if not isinstance(sentences, list) else sentences
    output_list = []
    
    for s1, s2 in sentences:
        inputs = tokenizer(s1, s2, return_tensors="pt", padding=True, truncation=True)
        
        # Make predictions
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
        
        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(logits, dim=1)
        
        # Get the predicted class
        predicted_class = torch.argmax(probs, dim=1).item()
        
        # Save the outputs
        output_list.append((predicted_class, probs.tolist()[0]))
    
    return output_list

# Example input strings
sentences = [
    ("I am called Tom. Tom is hungry. Tom wants to eat fish.", "Mick is thirsty."),
    ("I am called Tom. Tom is hungry. Tom wants to eat fish.", "Tom is hungry.")
]

inference(model, sentences)

[(0, [0.6112167835235596, 0.3887832760810852]),
 (1, [0.4407484531402588, 0.5592515468597412])]

# 2. Finetuning for Regression (not tested; no space on disk)

In [17]:
# Get the data ready
raw_datasets = load_dataset("gretelai/synthetic_text_to_sql")

# Split the training set into training and validation sets
train_test_split = raw_datasets["train"].train_test_split(test_size=0.1)

# Create a new DatasetDict including the new split
datasets = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'],
    'test': raw_datasets['test']
})

# Get the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
def tokenize_function(examples):
    inputs = tokenizer(examples["sql_prompt"], truncation=True, padding="max_length", max_length=512)
    targets = tokenizer(examples["sql"], truncation=True, padding="max_length", max_length=512)
    inputs["label"] = targets["input_ids"]
    return inputs

# Perform tokenization
tokenized_datasets = datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer)

Map:   0%|          | 0/90000 [00:00<?, ? examples/s]

OSError: [Errno 28] No space left on device

In [5]:
# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")

training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=1
)

trainer = Trainer(
    model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].select(range(10)),
    eval_dataset=tokenized_datasets["validation"].select(range(10)),
    data_collator=data_collator,
    tokenizer=tokenizer
)



pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [5]:
# Evaluate the outputs before training
predictions = trainer.predict(tokenized_datasets["test"].select(range(5)))
# print("Predictions shapes: ", predictions.predictions.shape, predictions.label_ids.shape)

# Load the accuracy metric
accuracy_metric = evaluate.load("bleu")

# Convert logits to predicted class
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Compute accuracy
bleu = accuracy_metric.compute(predictions=preds, references=predictions.label_ids)

print(f"Bleu (before training): {bleu['bleu']}")

  preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()


ValueError: expected sequence of length 50265 at dim 3 (got 768)

In [90]:
# Train the model
trainer.train()

Step,Training Loss


TrainOutput(global_step=2, training_loss=15.858112335205078, metrics={'train_runtime': 59.6909, 'train_samples_per_second': 0.168, 'train_steps_per_second': 0.034, 'total_flos': 3048682291200.0, 'train_loss': 15.858112335205078, 'epoch': 1.0})

In [None]:
# Evaluate the outputs after training
predictions = trainer.predict(tokenized_datasets["test"].select(range(5)))

# Load the accuracy metric
accuracy_metric = evaluate.load("bleu")

# Convert logits to predicted class
preds = torch.argmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Compute accuracy
bleu = accuracy_metric.compute(predictions=preds, references=predictions.label_ids)

print(f"Bleu (after training): {bleu['bleu']}")

In [97]:
# This is to decode for inference

# Decode the predictions and labels
decoded_predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions.predictions]
decoded_labels = [tokenizer.decode(label, skip_special_tokens=True) for label in predictions.label_ids]

# Print the results
for i in range(len(decoded_predictions)):
    print(f"Prediction: {decoded_predictions[i]}")
    print(f"Label: {decoded_labels[i]}")

array([[    0, 49179, 44619, ...,     1,     1,     1],
       [    0, 10089,  3850, ...,     1,     1,     1],
       [    0, 49179,  1709, ...,     1,     1,     1],
       ...,
       [    0, 49179,  3893, ...,     1,     1,     1],
       [    0, 49179,  1484, ...,     1,     1,     1],
       [    0, 49179, 32464, ...,     1,     1,     1]], dtype=int64)