In [1]:
pip install 'accelerate>=0.26.0' 'transformers[torch]'


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import BertTokenizer, BertModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Read the two tab-separated training files (orientation task, then power task).
orientation_training_data, power_training_data = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("orientation-it-train.tsv", "power-it-train.tsv")
)

# Show a heading followed by the first rows of each frame.
print("Orientation Data Sample:", orientation_training_data.head(), sep="\n")
print("\nPower Data Sample:", power_training_data.head(), sep="\n")

In [3]:
def _split_train_validate(frame):
    """Return a stratified 90/10 ``(train, validate)`` split of ``frame``.

    Stratification uses the frame's 'label' column and the seed is fixed,
    so re-running the cell gives the same partition.
    """
    return train_test_split(
        frame,
        train_size=0.9,
        test_size=0.1,
        stratify=frame['label'],
        random_state=42,
    )


orientation_train, orientation_validate = _split_train_validate(orientation_training_data)
power_train, power_validate = _split_train_validate(power_training_data)

# Heading plus the first rows of every split.
print("Orientation Train Data Sample:", orientation_train.head(), sep="\n")
print("Orientation Validate Data Sample:", orientation_validate.head(), sep="\n")
print("Power Train Data Sample:", power_train.head(), sep="\n")
print("Power Validate Data Sample:", power_validate.head(), sep="\n")


In [4]:

# One checkpoint name drives both the classifier and its tokenizer.
model_name = 'bert-base-multilingual-cased'

# Sequence classifier with a two-way output head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Tokenizer loaded from the same checkpoint as the model.
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_data(data, column):
    """Tokenize one text column of ``data`` with the module-level ``tokenizer``.

    Args:
        data: Object indexable by ``column`` whose result offers ``.tolist()``;
            the callers in this notebook pass pandas DataFrames.
        column: Name of the column that holds the texts.

    Returns:
        The value returned by ``tokenizer`` for the list of texts, requested
        with ``padding='max_length'`` and ``truncation=True``.
    """
    texts = data[column].tolist()
    return tokenizer(texts, padding='max_length', truncation=True)

# Orientation task: the 'text_en' column first, then the 'text' column,
# each tokenized for the train split and then the validation split.
orientation_train_encodings_en, orientation_validate_encodings_en = (
    tokenize_data(split, 'text_en') for split in (orientation_train, orientation_validate)
)
orientation_train_encodings_it, orientation_validate_encodings_it = (
    tokenize_data(split, 'text') for split in (orientation_train, orientation_validate)
)

# Power task: same column order and split order as above.
power_train_encodings_en, power_validate_encodings_en = (
    tokenize_data(split, 'text_en') for split in (power_train, power_validate)
)
power_train_encodings_it, power_validate_encodings_it = (
    tokenize_data(split, 'text') for split in (power_train, power_validate)
)

In [5]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection (``encodings[key][idx]`` is read for each item).
        labels: Indexable collection of labels; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [6]:
def _labelled_dataset(encodings, frame):
    """Wrap ``encodings`` in a ``Dataset`` using ``frame``'s 'label' column as labels."""
    return Dataset(encodings, frame['label'].tolist())


# Orientation task, 'text_en' encodings then 'text' encodings.
orientation_train_dataset_en = _labelled_dataset(orientation_train_encodings_en, orientation_train)
orientation_validate_dataset_en = _labelled_dataset(orientation_validate_encodings_en, orientation_validate)
orientation_train_dataset_it = _labelled_dataset(orientation_train_encodings_it, orientation_train)
orientation_validate_dataset_it = _labelled_dataset(orientation_validate_encodings_it, orientation_validate)

# Power task, same layout.
power_train_dataset_en = _labelled_dataset(power_train_encodings_en, power_train)
power_validate_dataset_en = _labelled_dataset(power_validate_encodings_en, power_validate)
power_train_dataset_it = _labelled_dataset(power_train_encodings_it, power_train)
power_validate_dataset_it = _labelled_dataset(power_validate_encodings_it, power_validate)

# Hyperparameters handed to every Trainer built in this notebook.
training_args = TrainingArguments(
    output_dir='./results',
    # Schedule: three epochs, evaluating at the end of each one.
    num_train_epochs=3,
    eval_strategy='epoch',
    # Optimisation.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batching (same size for training and evaluation).
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

def _new_trainer(train_dataset, eval_dataset):
    """Build a Trainer that owns its own freshly loaded classifier.

    A single model object shared by several trainers would make each
    ``.train()`` call continue from the weights the previous run left
    behind, and every trainer would end up evaluating the same final model.
    Loading the checkpoint once per trainer keeps the four experiments
    independent.

    Args:
        train_dataset: Dataset used for fine-tuning.
        eval_dataset: Dataset used for evaluation.

    Returns:
        A ``Trainer`` configured with the shared ``training_args``.
    """
    fresh_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    return Trainer(
        model=fresh_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )


# NOTE(review): every trainer keeps its own model alive, so memory use grows
# with the number of trainers; they also all write to the same output_dir.
trainer_orientation_en = _new_trainer(orientation_train_dataset_en, orientation_validate_dataset_en)
trainer_orientation_it = _new_trainer(orientation_train_dataset_it, orientation_validate_dataset_it)
trainer_power_en = _new_trainer(power_train_dataset_en, power_validate_dataset_en)
trainer_power_it = _new_trainer(power_train_dataset_it, power_validate_dataset_it)

# Fine-tune the four trainers one after another, in this order.
trainer_orientation_en.train()
trainer_orientation_it.train()
trainer_power_en.train()
# Last expression of the cell, so its return value is what the notebook displays.
trainer_power_it.train()


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (array whose last axis is arg-maxed to get the
            predicted class; presumably per-class scores, confirm against
            the model output).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use ``average="binary"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Index 3 of the result (support) is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }

# The trainers were constructed without a metrics callback, so evaluate()
# would report no accuracy/precision/recall/F1. Attach compute_metrics to
# each trainer before evaluating so those scores are included in the results.
for _trainer in (
    trainer_orientation_en,
    trainer_orientation_it,
    trainer_power_en,
    trainer_power_it,
):
    _trainer.compute_metrics = compute_metrics

# Each trainer evaluates on the eval_dataset it was built with.
orientation_eval_results_en = trainer_orientation_en.evaluate()
orientation_eval_results_it = trainer_orientation_it.evaluate()
power_eval_results_en = trainer_power_en.evaluate()
power_eval_results_it = trainer_power_it.evaluate()

print("Orientation English Evaluation Results:", orientation_eval_results_en)
print("Orientation Italian Evaluation Results:", orientation_eval_results_it)
print("Power English Evaluation Results:", power_eval_results_en)
print("Power Italian Evaluation Results:", power_eval_results_it)