<a href="https://colab.research.google.com/github/beyza720/CENG463-Assignment2/blob/main/task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers datasets torch
!pip install evaluate



In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

from google.colab import drive
drive.mount('/content/drive')

train_path = '/content/drive/My Drive/Colab Notebooks/orientation-lv-train.tsv'
train_orientation_data = pd.read_csv(train_path, sep='\t')

class_0 = train_orientation_data[train_orientation_data['label'] == 0]
class_1 = train_orientation_data[train_orientation_data['label'] == 1]

class_0_oversampled = resample(class_0, replace=True, n_samples=len(class_1), random_state=42)
balanced_train_orientation_data = pd.concat([class_1, class_0_oversampled])

train_orientation, val_orientation = train_test_split(
    balanced_train_orientation_data,
    test_size=0.1,
    stratify=balanced_train_orientation_data['label'],
    random_state=42
)

Mounted at /content/drive


In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import Dataset
from transformers import TrainingArguments
import numpy as np
import evaluate
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import Trainer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased", num_labels=2
)

train_dataset = Dataset.from_pandas(train_orientation)
val_dataset = Dataset.from_pandas(val_orientation)

def tokenize_function(examples):
    return tokenizer(examples["text_en"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=71,
    save_steps=500,
    save_total_limit=2
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1130 [00:00<?, ? examples/s]

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5795,0.452495,0.793651,0.813897,0.793651,0.790269
2,0.3118,0.250048,0.912698,0.915314,0.912698,0.912561
3,0.1641,0.227607,0.928571,0.929545,0.928571,0.928531


TrainOutput(global_step=213, training_loss=0.3518251365339252, metrics={'train_runtime': 78.9465, 'train_samples_per_second': 42.94, 'train_steps_per_second': 2.698, 'total_flos': 891946477670400.0, 'train_loss': 0.3518251365339252, 'epoch': 3.0})

In [5]:
# the following part is for the casual language model, I fine-tune this model also  to get better results

In [6]:
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# text
train_encodings = tokenizer(list(train_orientation['text']), truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(list(val_orientation['text']), truncation=True, padding=True, max_length=512)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = Dataset(train_encodings, train_orientation['label'].tolist())
val_dataset = Dataset(val_encodings, val_orientation['label'].tolist())

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # save_strategy, evaluation_strategy ile aynı olmalı
    report_to="none",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()

val_preds = trainer.predict(val_dataset)
predicted_labels = torch.argmax(torch.tensor(val_preds.predictions), axis=1).numpy()
true_labels = val_orientation['label'].tolist()

accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted')
recall = recall_score(true_labels, predicted_labels, average='weighted')
f1 = f1_score(true_labels, predicted_labels, average='weighted')

import pandas as pd
performance_table = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
    "Value": [accuracy, precision, recall, f1]
})

print("\nModel: XLM-RoBERTa Base")
print(performance_table.to_string(index=False))

print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6706,0.606217
2,0.5898,0.301456
3,0.2768,0.196116



Model: XLM-RoBERTa Base
   Metric    Value
 Accuracy 0.944444
Precision 0.944556
   Recall 0.944444
 F1 Score 0.944441

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94        63
           1       0.94      0.95      0.94        63

    accuracy                           0.94       126
   macro avg       0.94      0.94      0.94       126
weighted avg       0.94      0.94      0.94       126



In [8]:
# text_en

train_encodings_text_en = tokenizer(list(train_orientation['text_en']), truncation=True, padding=True, max_length=512)
val_encodings_text_en = tokenizer(list(val_orientation['text_en']), truncation=True, padding=True, max_length=512)

train_dataset_text_en = Dataset(train_encodings_text_en, train_orientation['label'].tolist())
val_dataset_text_en = Dataset(val_encodings_text_en, val_orientation['label'].tolist())

trainer_text_en = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_text_en,
    eval_dataset=val_dataset_text_en,
    tokenizer=tokenizer
)

trainer_text_en.train()

val_preds_text_en = trainer_text_en.predict(val_dataset_text_en)
predicted_labels_text_en = torch.argmax(torch.tensor(val_preds_text_en.predictions), axis=1).numpy()
true_labels_text_en = val_orientation['label'].tolist()

accuracy_text_en = accuracy_score(true_labels_text_en, predicted_labels_text_en)
precision_text_en = precision_score(true_labels_text_en, predicted_labels_text_en, average='weighted')
recall_text_en = recall_score(true_labels_text_en, predicted_labels_text_en, average='weighted')
f1_text_en = f1_score(true_labels_text_en, predicted_labels_text_en, average='weighted')

print("\nModel: XLM-RoBERTa Base for 'text_en'")
print("Accuracy:", accuracy_text_en)
print("Precision:", precision_text_en)
print("Recall:", recall_text_en)
print("F1 Score:", f1_text_en)

print("\nClassification Report for 'text_en':")
print(classification_report(true_labels_text_en, predicted_labels_text_en))


  trainer_text_en = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6352,0.316832
2,0.1613,0.17682
3,0.1788,0.150649



Model: XLM-RoBERTa Base for 'text_en'
Accuracy: 0.9682539682539683
Precision: 0.9687263556116016
Recall: 0.9682539682539683
F1 Score: 0.9682459677419355

Classification Report for 'text_en':
              precision    recall  f1-score   support

           0       0.98      0.95      0.97        63
           1       0.95      0.98      0.97        63

    accuracy                           0.97       126
   macro avg       0.97      0.97      0.97       126
weighted avg       0.97      0.97      0.97       126

