# Fake news detection: Roberta Model

Model: https://huggingface.co/winterForestStump/Roberta-fake-news-detector

In the dataset
*   0: fake new
*   1: real new

## Installations

In [1]:
%%capture
!pip install transformers
!pip install pytorch-lightning

## Finetuning Roberta

In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import numpy as np
from torch.optim import AdamW


## Dataset preparation

In [3]:
class FakeNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = list(texts)
        self.labels = list(labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

## Data uploading
Upload locally the "Datos.xlsx" file.

In [19]:
df = pd.read_excel('Datos.xlsx')
X = df['text'].values
y = df['real'].values


df.reset_index(drop=True, inplace=True)
X = df['text'].values
y = df['real'].values


X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.176, random_state=42)

print("Number of training news:", len(X_train))
print("Number of validation news:", len(X_val))
print("Number of testing news:", len(X_test))


Number of training news: 435
Number of validation news: 93
Number of testing news: 94


In [6]:
model_name = "winterForestStump/Roberta-fake-news-detector"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

In [8]:
train_dataset = FakeNewsDataset(X_train, y_train, tokenizer)
val_dataset = FakeNewsDataset(X_val, y_val, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

test_dataset = FakeNewsDataset(X_test, y_test, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=16)

## Model Configuration

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

config.json:   0%|          | 0.00/859 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [10]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

### Training

In [11]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [12]:
def train_epoch(model, dataloader, optimizer, device):
    model = model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()

        # Predictions
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_samples += len(labels)

        # Backward pass
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions.double() / total_samples
    return avg_loss, accuracy

### Evaluation

In [13]:
def eval_model(model, dataloader, device):
    model = model.eval()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_loss += loss.item()

            # Predictions
            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_samples += len(labels)

    avg_loss = total_loss / len(dataloader)
    accuracy = correct_predictions.double() / total_samples
    return avg_loss, accuracy


### Finetuning

In [14]:
epochs = 3
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Training
    train_loss, train_accuracy = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Evaluation
    val_loss, val_accuracy = eval_model(model, val_dataloader, device)
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# Saving the model
model.save_pretrained("fine_tuned_roberta_fake_news")
tokenizer.save_pretrained("fine_tuned_roberta_fake_news")



Epoch 1/3


Training: 100%|██████████| 28/28 [00:35<00:00,  1.28s/it]


Train Loss: 1.1173, Train Accuracy: 0.5977


Evaluating: 100%|██████████| 6/6 [00:02<00:00,  2.68it/s]


Val Loss: 0.5738, Val Accuracy: 0.6989

Epoch 2/3


Training: 100%|██████████| 28/28 [00:36<00:00,  1.29s/it]


Train Loss: 0.4179, Train Accuracy: 0.8414


Evaluating: 100%|██████████| 6/6 [00:02<00:00,  2.44it/s]


Val Loss: 0.4885, Val Accuracy: 0.7849

Epoch 3/3


Training: 100%|██████████| 28/28 [00:37<00:00,  1.35s/it]


Train Loss: 0.2925, Train Accuracy: 0.8966


Evaluating: 100%|██████████| 6/6 [00:02<00:00,  2.26it/s]


Val Loss: 0.5800, Val Accuracy: 0.7634


('fine_tuned_roberta_fake_news/tokenizer_config.json',
 'fine_tuned_roberta_fake_news/special_tokens_map.json',
 'fine_tuned_roberta_fake_news/vocab.json',
 'fine_tuned_roberta_fake_news/merges.txt',
 'fine_tuned_roberta_fake_news/added_tokens.json',
 'fine_tuned_roberta_fake_news/tokenizer.json')

## Testing

In [15]:
from sklearn.metrics import classification_report
from tqdm import tqdm

def get_predictions(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            _, preds = torch.max(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_preds, all_labels


In [16]:
# Getting preditions with the test data
preds, trues = get_predictions(model, test_dataloader, device)

# Metrics
from sklearn.metrics import classification_report
print(classification_report(trues, preds, digits=4))


Evaluating: 100%|██████████| 6/6 [00:02<00:00,  2.23it/s]

              precision    recall  f1-score   support

           0     0.8621    0.6579    0.7463        38
           1     0.8000    0.9286    0.8595        56

    accuracy                         0.8191        94
   macro avg     0.8310    0.7932    0.8029        94
weighted avg     0.8251    0.8191    0.8137        94






Reporting the results

In [17]:
report_dict = classification_report(trues, preds, output_dict=True)

df_report = pd.DataFrame(report_dict).transpose()
df_report = df_report.round(4)

from IPython.display import display
display(df_report)

Unnamed: 0,precision,recall,f1-score,support
0,0.8621,0.6579,0.7463,38.0
1,0.8,0.9286,0.8595,56.0
accuracy,0.8191,0.8191,0.8191,0.8191
macro avg,0.831,0.7932,0.8029,94.0
weighted avg,0.8251,0.8191,0.8137,94.0
