<a href="https://colab.research.google.com/github/beyza720/CENG463-Assignment2/blob/main/task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers datasets torch
!pip install evaluate



In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from google.colab import drive

drive.mount('/content/drive')


train_path = '/content/drive/My Drive/Colab Notebooks/power-lv-train.tsv'

train_power_data = pd.read_csv(train_path, sep='\t')


# Class imbalance for power
class_0 = train_power_data[train_power_data['label'] == 0]
class_1 = train_power_data[train_power_data['label'] == 1]

class_1_oversampled = resample(class_1, replace=True, n_samples=len(class_0), random_state=42)

balanced_train_power_data = pd.concat([class_1_oversampled, class_0])


# split the train data into 0.9 train and 0.1 validation
train_power, val_power = train_test_split(
    balanced_train_power_data,
    test_size=0.1,
    stratify=balanced_train_power_data['label'],
    random_state=42
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
from transformers import TrainingArguments
import numpy as np
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import Trainer


tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2
)

train_dataset = Dataset.from_pandas(train_power)
val_dataset = Dataset.from_pandas(val_power)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(output_dir="test_trainer")

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=107,
    save_steps=107,
    save_total_limit=3
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1699 [00:00<?, ? examples/s]

Map:   0%|          | 0/189 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6735,0.644434,0.666667,0.666667,0.666667,0.666648
2,0.6214,0.580226,0.714286,0.724645,0.714286,0.711215
3,0.5184,0.521376,0.751323,0.751959,0.751323,0.751114


TrainOutput(global_step=321, training_loss=0.6044185079889506, metrics={'train_runtime': 123.6062, 'train_samples_per_second': 41.236, 'train_steps_per_second': 2.597, 'total_flos': 1341077049169920.0, 'train_loss': 0.6044185079889506, 'epoch': 3.0})

In [66]:
# the following part is for the casual language model, I fine-tune this model also  to get better results

In [5]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# text
train_encodings = tokenizer(list(train_power['text']), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_power['text']), truncation=True, padding=True, max_length=512)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = Dataset(train_encodings, train_power['label'].tolist())
val_dataset = Dataset(val_encodings, val_power['label'].tolist())

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # save_strategy, evaluation_strategy ile aynı olmalı
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()

val_preds = trainer.predict(val_dataset)
predicted_labels = torch.argmax(torch.tensor(val_preds.predictions), axis=1).numpy()
true_labels = val_power['label'].tolist()

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

import pandas as pd
performance_table = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
    "Value": [accuracy, precision, recall, f1]
})

print("\nModel: XLM-RoBERTa Base")
print(performance_table.to_string(index=False))

print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6794,0.685264
2,0.672,0.619751
3,0.625,0.576984



Model: XLM-RoBERTa Base
   Metric    Value
 Accuracy 0.693122
Precision 0.697216
   Recall 0.693122
 F1 Score 0.691306

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.77      0.72        95
           1       0.72      0.62      0.67        94

    accuracy                           0.69       189
   macro avg       0.70      0.69      0.69       189
weighted avg       0.70      0.69      0.69       189



In [8]:
# text_en
train_encodings_text_en = tokenizer(list(train_power['text_en']), truncation=True, padding=True, max_length=512)
val_encodings_text_en = tokenizer(list(val_power['text_en']), truncation=True, padding=True, max_length=512)

train_dataset_text_en = Dataset(train_encodings_text_en, train_power['label'].tolist())
val_dataset_text_en = Dataset(val_encodings_text_en, val_power['label'].tolist())

trainer_text_en = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_text_en,
    eval_dataset=val_dataset_text_en,
    tokenizer=tokenizer
)

trainer_text_en.train()

val_preds_text_en = trainer_text_en.predict(val_dataset_text_en)
predicted_labels_text_en = torch.argmax(torch.tensor(val_preds_text_en.predictions), axis=1).numpy()
true_labels_text_en = val_power['label'].tolist()

accuracy_text_en = accuracy_score(true_labels_text_en, predicted_labels_text_en)
precision_text_en = precision_score(true_labels_text_en, predicted_labels_text_en, average='weighted')
recall_text_en = recall_score(true_labels_text_en, predicted_labels_text_en, average='weighted')
f1_text_en = f1_score(true_labels_text_en, predicted_labels_text_en, average='weighted')

print("\nModel: XLM-RoBERTa Base for 'text_en'")
print("Accuracy:", accuracy_text_en)
print("Precision:", precision_text_en)
print("Recall:", recall_text_en)
print("F1 Score:", f1_text_en)

print("\nClassification Report for 'text_en':")
print(classification_report(true_labels_text_en, predicted_labels_text_en))

  trainer_text_en = Trainer(


Epoch,Training Loss,Validation Loss
1,0.5454,0.560316
2,0.4242,0.556599
3,0.3583,0.421035



Model: XLM-RoBERTa Base for 'text_en'
Accuracy: 0.873015873015873
Precision: 0.8731688402239726
Recall: 0.873015873015873
F1 Score: 0.8729945383171189

Classification Report for 'text_en':
              precision    recall  f1-score   support

           0       0.87      0.88      0.88        95
           1       0.88      0.86      0.87        94

    accuracy                           0.87       189
   macro avg       0.87      0.87      0.87       189
weighted avg       0.87      0.87      0.87       189

