# CLEF2023 Task 2: Subjectivity in news articles

In [17]:
# All imports
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict


In [37]:

# Reading the dataset
train_data = pd.read_csv('data/subtask-2-english/train_en.tsv', delimiter='\t')
validation_data = pd.read_csv('data/subtask-2-english/dev_en.tsv', delimiter='\t')
testing_data = pd.read_csv('data/subtask-2-english/test_en.tsv', delimiter='\t')
testing_data_gold = pd.read_csv('data/subtask-2-english/test_en_gold.tsv', delimiter='\t')


train_data['label'] = train_data['label'].replace({'SUBJ': 0, 'OBJ': 1})
validation_data['label'] = validation_data['label'].replace({'SUBJ': 0, 'OBJ': 1})

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_data)
validation_dataset = Dataset.from_pandas(validation_data)
testing_dataset = Dataset.from_pandas(testing_data)
testing_gold_dataset = Dataset.from_pandas(testing_data_gold)

# Create DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': testing_dataset,
    'test_gold': testing_gold_dataset
})

print(dataset_dict)


DatasetDict({
    train: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'solved_conflict'],
        num_rows: 830
    })
    validation: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'solved_conflict'],
        num_rows: 219
    })
    test: Dataset({
        features: ['sentence_id', 'sentence'],
        num_rows: 243
    })
    test_gold: Dataset({
        features: ['sentence_id', 'sentence', 'label'],
        num_rows: 243
    })
})


In [38]:
from transformers import AutoTokenizer
import numpy as np

roberta_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [39]:
def tokenize_function(examples):
    
    """ This function tokenizes the text in the examples dictionary.
        We pass it to the map function of the dataset so that we can batch the tokenization for efficiency by
        tokenizing batches in parallel.
    """
    return roberta_tokenizer(examples["sentence"], padding="max_length", truncation=True)

In [40]:
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/830 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

Map:   0%|          | 0/243 [00:00<?, ? examples/s]

In [41]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'solved_conflict', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 830
    })
    validation: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'solved_conflict', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 219
    })
    test: Dataset({
        features: ['sentence_id', 'sentence', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 243
    })
    test_gold: Dataset({
        features: ['sentence_id', 'sentence', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 243
    })
})

In [42]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [43]:
training_args = TrainingArguments(
    output_dir="my_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [44]:
# !pip install evaluate
# 1.

import numpy as np
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# 2.

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return clf_metrics.compute(predictions=predictions, references=labels)

In [45]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [46]:
trainer.args.device

device(type='cuda', index=0)

In [48]:
trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [11]:
import pandas as pd
from transformers import AutoTokenizer
import torch

def tokenize_data(dataset):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def tokenize_function(example):
        encoded_set = tokenizer(example["sentence"], padding="max_length", truncation=True)
        return encoded_set

    encoded_dataset = dataset.apply(tokenize_function, axis=1)

    # Convert the encoded dataset to a torch.Tensor with data type specified as int64
    # encoded_dataset = torch.tensor(list(encoded_dataset), dtype=torch.int64)

    return encoded_dataset


In [6]:
encoded_dataset = tokenize_data(train_data)

Unnamed: 0,sentence_id,sentence,label,solved_conflict
0,b9e1635a-72aa-467f-86d6-f56ef09f62c3,Gone are the days when they led the world in r...,SUBJ,True
1,f99b5143-70d2-494a-a2f5-c68f10d09d0a,The trend is expected to reverse as soon as ne...,OBJ,False
2,4076639c-aa56-4202-ae0f-9d9217f8da68,But there is the specious point again.,OBJ,False
3,b057c366-698e-419d-a284-9b16d835c64e,He added he wouldn’t be surprised to see a new...,OBJ,False
4,a5a9645e-7850-41ba-90a2-5def725cd5b8,"Not less government, you see; the same amount ...",SUBJ,False
...,...,...,...,...
825,9a0f5eec-cc36-49b8-88eb-20ad2c056eaa,Local governments and their financing vehicles...,SUBJ,False
826,73545884-adf8-480c-a8b5-e65128ba8e91,That fact alone underscores the biggest proble...,SUBJ,False
827,a8825286-21a4-46c9-9410-c0e7e183d708,Presumably it had in mind those Russian offici...,SUBJ,False
828,c984fc97-2604-4690-a2c0-d748703663af,"From bad taxation, reckless borrowing and reck...",SUBJ,False


In [15]:

# Splitting the data into features and labels
train_texts = train_data['sentence'].values
train_labels = train_data['label'].values

# Splitting the training data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Tokenizing the texts
train_encodings = tokenizer.batch_encode_plus(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer.batch_encode_plus(val_texts.tolist(), truncation=True, padding=True)
test_encodings = tokenizer.batch_encode_plus(testing_data['sentence'].tolist(), truncation=True, padding=True)

# Creating the dataset objects
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # Convert labels to long type
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_encodings, train_labels)
val_dataset = ClassificationDataset(val_encodings, val_labels)
test_dataset = ClassificationDataset(test_encodings, testing_data_gold['label'].values)

# Define the model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: classification_report(val_labels, pred.predictions.argmax(-1), digits=4)
)

# Train the model
trainer.train()

# Evaluate on the test set
predictions = trainer.predict(test_dataset)
print(classification_report(testing_data_gold['label'], predictions.predictions.argmax(-1), digits=4))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

TypeError: new(): invalid data type 'str'

In [14]:
# Train the model
trainer.train()

# Evaluate on the test set
predictions = trainer.predict(test_dataset)
print(classification_report(testing_data_gold['label'], predictions.predictions.argmax(-1), digits=4))


TypeError: new(): invalid data type 'str'