# CLEF2023 Task 2: Subjectivity in news articles

In [9]:
# All imports
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import ParameterGrid
from tabulate import tabulate
import torch


In [10]:
# !pip install transformers[torch] datasets

In [11]:

# Reading the dataset
train_data = pd.read_csv('./data/subtask-2-english/train_en.tsv', delimiter='\t')
validation_data = pd.read_csv('./data/subtask-2-english/dev_en.tsv', delimiter='\t')
testing_data = pd.read_csv('./data/subtask-2-english/test_en.tsv', delimiter='\t')
testing_data_gold = pd.read_csv('./data/subtask-2-english/test_en_gold.tsv', delimiter='\t')


train_data['label'] = train_data['label'].replace({'SUBJ': 0, 'OBJ': 1})
validation_data['label'] = validation_data['label'].replace({'SUBJ': 0, 'OBJ': 1})
testing_data_gold['label'] = testing_data_gold['label'].replace({'SUBJ': 0, 'OBJ': 1})

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
validation_dataset = Dataset.from_pandas(validation_data)
testing_dataset = Dataset.from_pandas(testing_data)
testing_data_gold = Dataset.from_pandas(testing_data_gold)

# Create DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': testing_data_gold,
})

print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'solved_conflict'],
        num_rows: 830
    })
    validation: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'solved_conflict'],
        num_rows: 219
    })
    test: Dataset({
        features: ['sentence_id', 'sentence', 'label'],
        num_rows: 243
    })
})


In [15]:
from transformers import AutoTokenizer, RobertaTokenizer
import numpy as np

roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-uncased")

OSError: roberta-uncased is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [6]:
def tokenize_function(examples):

    """ This function tokenizes the text in the examples dictionary.
        We pass it to the map function of the dataset so that we can batch the tokenization for efficiency by
        tokenizing batches in parallel.
    """
    return roberta_tokenizer(examples["sentence"], padding="max_length", truncation=True)

In [7]:
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

In [12]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'solved_conflict', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 830
    })
    validation: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'solved_conflict', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 219
    })
    test: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 243
    })
})

In [8]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Normalize numerical features in the training set
train_features = scaler.fit_transform(tokenized_datasets["train"]["input_ids"])

# Apply the same transformation to the validation and test sets
validation_features = scaler.transform(tokenized_datasets["validation"]["input_ids"])
test_features = scaler.transform(tokenized_datasets["test"]["input_ids"])


In [13]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [16]:
training_args = TrainingArguments(
    output_dir="my_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [15]:
!pip install evaluate
# 1.

import numpy as np
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# 2.

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return clf_metrics.compute(predictions=predictions, references=labels)

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.0 responses-0.18.0


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [18]:
trainer.args.device

device(type='cuda', index=0)

In [19]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.753838,0.621399,0.707006,0.560606,0.956897
2,No log,0.707929,0.716049,0.749091,0.647799,0.887931
3,No log,0.729368,0.73251,0.718615,0.721739,0.715517
4,No log,0.987657,0.711934,0.75,0.640244,0.905172
5,No log,1.039293,0.748971,0.728889,0.752294,0.706897
6,No log,1.181227,0.744856,0.752,0.701493,0.810345
7,No log,1.359469,0.707819,0.732075,0.651007,0.836207
8,No log,1.370892,0.728395,0.744186,0.676056,0.827586
9,No log,1.352499,0.736626,0.746032,0.691176,0.810345
10,0.164100,1.368633,0.736626,0.746032,0.691176,0.810345


TrainOutput(global_step=520, training_loss=0.1579105357854412, metrics={'train_runtime': 894.9212, 'train_samples_per_second': 9.275, 'train_steps_per_second': 0.581, 'total_flos': 2183821759488000.0, 'train_loss': 0.1579105357854412, 'epoch': 10.0})

In [20]:
trainer.evaluate()

{'eval_loss': 0.7079294323921204,
 'eval_accuracy': 0.7160493827160493,
 'eval_f1': 0.7490909090909091,
 'eval_precision': 0.6477987421383647,
 'eval_recall': 0.8879310344827587,
 'eval_runtime': 7.6856,
 'eval_samples_per_second': 31.617,
 'eval_steps_per_second': 4.033,
 'epoch': 10.0}

In [21]:
trainer.evaluate(tokenized_datasets["validation"])

{'eval_loss': 0.6290113925933838,
 'eval_accuracy': 0.730593607305936,
 'eval_f1': 0.7591836734693876,
 'eval_precision': 0.6690647482014388,
 'eval_recall': 0.8773584905660378,
 'eval_runtime': 6.9539,
 'eval_samples_per_second': 31.493,
 'eval_steps_per_second': 4.026,
 'epoch': 10.0}

In [22]:
pt_save_directory = "./pt_save_pretrained"
roberta_tokenizer.save_pretrained(pt_save_directory)
model.save_pretrained(pt_save_directory)

In [23]:
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]
val_dataset = tokenized_datasets["train"]
first_10_samples = train_dataset.select(range(10))
first_10_val = val_dataset.select(range(10))
first_10_test = test_dataset.select(range(10))
first_10_samples

Dataset({
    features: ['sentence_id', 'sentence', 'label', 'solved_conflict', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10
})

In [None]:

torch.backends.cuda.max_split_size_mb = 1

param_grid = {
    'learning_rate': [1e-5, 2e-5],
    'per_device_train_batch_size': [8, 16],
    'num_train_epochs': [10],
}
model_grid = ParameterGrid(param_grid)

data = []
head = ['learning_rate', 'per_device_train_batch_size', 'num_train_epochs', 'score in validation set']
trainers = []

for param in model_grid:
    torch.cuda.empty_cache()
    training_args = TrainingArguments(
    output_dir="my_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    **param
    )

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    )

    trainer.train()
    score = trainer.evaluate(tokenized_datasets["validation"])

    list_entry = [param['learning_rate'], param['per_device_train_batch_size'], param['num_train_epochs'], score]
    data.insert(0, list_entry)
    trainers.insert(0, trainer)





Epoch,Training Loss,Validation Loss


In [30]:
for de in data:
  de[3] = de[3]['eval_f1']


In [39]:
print(tabulate(data, headers=head, tablefmt="pipe"))
max_accuracy = max(entry[3] for entry in data)
best_model = next(item for item in data if item[3] == max_accuracy)
print("The highest Accuracy %f is the model model with learning_rate = %s, per_device_train_batch_size = %s, num_train_epochs = %f" %
      (best_model[3], best_model[0], best_model[1], best_model[2]))

|   learning_rate |   per_device_train_batch_size |   num_train_epochs |   score in validation set |
|----------------:|------------------------------:|-------------------:|--------------------------:|
|           2e-05 |                            16 |                 10 |                  0.767857 |
|           2e-05 |                             8 |                 10 |                  0.806867 |
|           1e-05 |                            16 |                 10 |                  0.772093 |
|           1e-05 |                             8 |                 10 |                  0.754237 |
The highest Accuracy 0.806867 is the model model with learning_rate = 2e-05, per_device_train_batch_size = 8, num_train_epochs = 10.000000


In [40]:
training_args = TrainingArguments(
    output_dir="my_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    learning_rate = 2e-05,
    per_device_train_batch_size = 8,
    num_train_epochs = 10
    )

trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["test"],
compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,2.576097,0.720165,0.732283,0.673913,0.801724
2,No log,1.789981,0.765432,0.779923,0.706294,0.87069
3,No log,2.086571,0.736626,0.753846,0.680556,0.844828
4,No log,2.009155,0.753086,0.769231,0.694444,0.862069
5,0.034900,2.513577,0.711934,0.72,0.671642,0.775862
6,0.034900,2.426595,0.720165,0.728,0.679104,0.784483
7,0.034900,2.684336,0.720165,0.748148,0.655844,0.87069
8,0.034900,2.45231,0.72428,0.728745,0.687023,0.775862
9,0.034900,2.477492,0.72428,0.728745,0.687023,0.775862
10,0.003100,2.481918,0.72428,0.728745,0.687023,0.775862


TrainOutput(global_step=1040, training_loss=0.01830135596381017, metrics={'train_runtime': 910.2335, 'train_samples_per_second': 9.119, 'train_steps_per_second': 1.143, 'total_flos': 2183821759488000.0, 'train_loss': 0.01830135596381017, 'epoch': 10.0})

In [41]:
trainer.evaluate(tokenized_datasets["validation"])

{'eval_loss': 1.5260943174362183,
 'eval_accuracy': 0.776255707762557,
 'eval_f1': 0.7822222222222223,
 'eval_precision': 0.7394957983193278,
 'eval_recall': 0.8301886792452831,
 'eval_runtime': 7.1976,
 'eval_samples_per_second': 30.427,
 'eval_steps_per_second': 3.89,
 'epoch': 10.0}

In [48]:
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 1.7899813652038574,
 'eval_accuracy': 0.7654320987654321,
 'eval_f1': 0.7799227799227799,
 'eval_precision': 0.7062937062937062,
 'eval_recall': 0.8706896551724138,
 'eval_runtime': 7.9497,
 'eval_samples_per_second': 30.567,
 'eval_steps_per_second': 3.9,
 'epoch': 10.0}

In [46]:
best_trainer = trainers[1]

IndexError: ignored

In [None]:
best_trainer.evaluate(tokenized_datasets["validation"])