In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load pre-trained model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should pr

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [4]:
# Load and preprocess data
def load_data(folder):
    data = []
    for filename in os.listdir(folder):
        if filename.endswith('.json'):
            with open(os.path.join(folder, filename)) as f:
                file_data = json.load(f)
                for article in file_data['articles']:
                    data.append((article['title'] + ' ' + article['content'], file_data['label_text']))
    return data

In [5]:
train_data = load_data('../final_project/datasets/dataset_fake_news_task4/train_json')  
test_data = load_data('../final_project/datasets/dataset_fake_news_task4/dev_json')  

# Convert labels to integers
label_mapping = {'low': 0, 'mixed': 1, 'high': 2}
train_data = [(text, label_mapping[label]) for text, label in train_data]
test_data = [(text, label_mapping[label]) for text, label in test_data]
print(train_data[0])

('Joe Biden’s Lying Anti-Trump Ad is STILL on Twitter: Gets Highest Fake News Rating of “Four Pinocchios” Presidential candidate Joe Biden has a well-known history of lying and plagiarism .\nIt looks like his campaign is getting in on the act with their latest ad that just got the worst fake news rating possible .\nThe Washington Post gave Biden ’ s new ad “ Four Pinocchios ” for “ manipulating video ” to make it appear as though President Trump called the coronavirus a hoax .\nThe Biden campaign cut out over 120 words in between the word “ coronavirus ” and then “ This is their new hoax.\n” ( see transcript below ) In  saying “ coronavirus , ” followed immediately by “ This is their new hoax.\n” What the president was saying is that the Democratic politicization of the coronavirus is a “ hoax ” and NOT the virus itself .\nThe ad goes on to show images and words that are disconnected and made to make it seem like the president said “ The American Dream ” … ” is dead.\n” This is gutter 

In [6]:
# Convert the datasets into the format required by the model
train_encodings = tokenizer([text for text, label in train_data], truncation=True, padding=True, max_length=512)
train_labels = [label for text, label in train_data]

test_encodings = tokenizer([text for text, label in test_data], truncation=True, padding=True, max_length=512)
test_labels = [label for text, label in test_data]

In [7]:
# Define the Dataset class
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [8]:
# Create the datasets
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

In [9]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Calculate accuracy
    accuracy = accuracy_score(labels, predictions)

    # Calculate the Macro-F1 score
    macro_f1 = f1_score(labels, predictions, average='macro')

    # Calculate precision, recall, f1-score
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [10]:

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results_RoBerta',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    learning_rate=5e-5,
    logging_dir='./logs',            # directory for storing logs
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [11]:
# Train the model
trainer.train()

 31%|███       | 500/1602 [06:45<14:41,  1.25it/s]

{'loss': 0.8477, 'learning_rate': 5e-05, 'epoch': 0.94}


 62%|██████▏   | 1000/1602 [13:27<07:58,  1.26it/s]

{'loss': 0.6516, 'learning_rate': 2.7313974591651543e-05, 'epoch': 1.87}


 94%|█████████▎| 1500/1602 [20:10<01:21,  1.25it/s]

{'loss': 0.4125, 'learning_rate': 4.627949183303086e-06, 'epoch': 2.81}


100%|██████████| 1602/1602 [21:35<00:00,  1.24it/s]

{'train_runtime': 1295.2665, 'train_samples_per_second': 19.761, 'train_steps_per_second': 1.237, 'train_loss': 0.6196820095981402, 'epoch': 3.0}





TrainOutput(global_step=1602, training_loss=0.6196820095981402, metrics={'train_runtime': 1295.2665, 'train_samples_per_second': 19.761, 'train_steps_per_second': 1.237, 'train_loss': 0.6196820095981402, 'epoch': 3.0})

In [12]:
# Evaluate the model
trainer.evaluate()

100%|██████████| 18/18 [00:17<00:00,  1.06it/s]


{'eval_loss': 0.9315750002861023,
 'eval_accuracy': 0.6715462031107045,
 'eval_precision': 0.6623653429798128,
 'eval_recall': 0.6715462031107045,
 'eval_f1': 0.6654584891763083,
 'eval_runtime': 18.5582,
 'eval_samples_per_second': 58.896,
 'eval_steps_per_second': 0.97,
 'epoch': 3.0}