<a href="https://colab.research.google.com/github/britssc/ecuadorian_fake_news_detection/blob/main/FakeNewsDetectionBertin_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake News Detection: BERTIN Model Fine-Tuning Process

Dataset: news articles about Ecuador's 2025 election (623 articles)

Model Website: https://huggingface.co/bertin-project/bertin-roberta-base-spanish

## Installations

In [None]:
%%capture
!pip install transformers
!pip install pytorch-lightning

## Finetuning Bertin

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import numpy as np
from torch.optim import AdamW

## Dataset preparation

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one news text per item.

    Args:
        texts: Iterable of raw text strings.
        labels: Iterable of integer class labels, aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts ``max_length``, ``truncation``, ``padding`` and
            ``return_tensors`` and returns a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length every sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Materialise as lists so positional indexing works for any iterable
        # (NumPy arrays, pandas Series with a non-default index, ...).
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return its tensors.

        Returns:
            dict with keys ``input_ids``, ``attention_mask`` (both flattened
            to 1-D) and ``label`` (a scalar ``torch.long`` tensor).
        """
        text = self.texts[item]
        label = self.labels[item]

        # Calling the tokenizer directly is the recommended entry point and
        # replaces the legacy `encode_plus` method.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis for the single text;
        # flatten it away so the DataLoader can stack items itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Load the cleaned corpus: 'text' holds the article body and 'real' the label.
df = pd.read_csv('clean_data.csv')
X = df['text'].values
y = df['real'].values

# 70 / 15 / 15 train / validation / test split.
# Stratifying on the label keeps the class ratio the same in every partition,
# which matters on a dataset this small: an unstratified draw can leave the
# validation or test set noticeably skewed towards one class.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)


print("Number of training news:", len(X_train))
print("Number of validation news:", len(X_val))
print("Number of testing news:", len(X_test))

In [None]:
# Checkpoint identifier passed to `from_pretrained` for the tokenizer below.
model_name = "bertin-project/bertin-roberta-base-spanish"  # or whichever model you want to use
# Load the tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Wrap each split in a FakeNewsDataset so items are tokenized on access.
train_dataset = FakeNewsDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
val_dataset = FakeNewsDataset(texts=X_val, labels=y_val, tokenizer=tokenizer)
test_dataset = FakeNewsDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)

# Only the training loader shuffles; validation and test keep dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=16)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=16)

## Model Configuration

In [None]:
# Seed PyTorch's global RNG before the model is built. The two-label
# classification head placed on top of the pretrained encoder is freshly
# initialised, and the same RNG later drives dropout and the shuffling of the
# training DataLoader, so without a seed every run gives different results.
torch.manual_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

In [None]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model's parameters to the chosen device.
model = model.to(device)

## Training

In [None]:
# AdamW over every model parameter, with a fixed learning rate of 2e-5 and
# eps of 1e-8; no learning-rate scheduler is attached in this notebook.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        dataloader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer bound to ``model``'s parameters.
        device: Device the batch tensors are moved to.

    Returns:
        tuple[float, float]: ``(mean loss per sample, accuracy)`` as plain
        Python floats, so they can be logged or serialised without holding
        on to device tensors.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.train()
    loss_sum = 0.0
    correct = 0
    seen = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        batch_size = labels.size(0)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so a short final batch does not
        # count as much as a full one.
        # Assumes `outputs.loss` is the mean over the batch (the Hugging Face
        # default for sequence classification) -- confirm for other models.
        loss_sum += loss.item() * batch_size

        # Predictions come from the logits of the forward pass above, i.e.
        # before this step's weight update.
        preds = outputs.logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        seen += batch_size

    if seen == 0:
        raise ValueError("dataloader yielded no samples; cannot compute metrics")

    return loss_sum / seen, correct / seen

### Evaluation

In [None]:
def eval_model(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        dataloader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple[float, float]: ``(mean loss per sample, accuracy)`` as plain
        Python floats.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            batch_size = labels.size(0)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            # Weight each batch loss by its size so a short final batch does
            # not count as much as a full one.
            # Assumes `outputs.loss` is the mean over the batch (the Hugging
            # Face default for sequence classification) -- confirm otherwise.
            loss_sum += outputs.loss.item() * batch_size

            # Predictions
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            seen += batch_size

    if seen == 0:
        raise ValueError("dataloader yielded no samples; cannot compute metrics")

    return loss_sum / seen, correct / seen


### Finetuning

In [None]:
# Number of full passes over the training set.
epochs = 3
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Training
    train_loss, train_accuracy = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

    # Evaluation on the validation split after each epoch
    val_loss, val_accuracy = eval_model(model, val_dataloader, device)
    print(f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

# 7. Save the fine-tuned model.
# This runs once after the loop, so the weights written are those of the last
# epoch; the tokenizer is saved to the same directory.
model.save_pretrained("fine_tuned_bertin_fake_news")
tokenizer.save_pretrained("fine_tuned_bertin_fake_news")


## Testing

In [None]:
from sklearn.metrics import classification_report
from tqdm import tqdm

def get_predictions(model, dataloader, device):
    """Collect predicted and true class ids for every sample in ``dataloader``.

    The model is switched to eval mode and run without gradient tracking.
    Each batch must be a dict with ``input_ids``, ``attention_mask`` and
    ``label`` tensors.

    Returns:
        tuple[list, list]: ``(predictions, labels)``, one entry per sample,
        in the order the dataloader produced them.
    """
    predicted, actual = [], []
    model.eval()

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            targets = batch['label'].to(device)
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            # Index of the largest logit along the class axis is the prediction.
            batch_preds = logits.argmax(dim=1)

            predicted.extend(batch_preds.cpu().numpy())
            actual.extend(targets.cpu().numpy())

    return predicted, actual


In [None]:
# Get predictions and true labels for the held-out test split
preds, trues = get_predictions(model, test_dataloader, device)

# Metrics: per-class precision / recall / F1 plus the averaged rows,
# printed with four decimal places
from sklearn.metrics import classification_report
print(classification_report(trues, preds, digits=4))


Reporting the results

In [None]:
from IPython.display import display

# Same report as a nested dict, so it can be turned into a table.
report_dict = classification_report(trues, preds, output_dict=True)

# One row per class / average, metrics as columns, rounded to 4 decimals.
df_report = pd.DataFrame(report_dict).T.round(4)

display(df_report)

In [None]:
# Saving the results
def save_metrics(trues, preds, model_name="model", csv_path="bertin_metrics.csv"):
    """Write a one-row CSV with summary classification metrics.

    Args:
        trues: True class labels.
        preds: Predicted class labels.
        model_name: Value stored in the ``Model`` column.
        csv_path: Destination file; it is overwritten if it already exists.

    The report must contain entries for classes ``"0"`` and ``"1"``; a
    ``KeyError`` is raised otherwise.
    """
    report = classification_report(trues, preds, digits=4, output_dict=True)

    columns = [
        "Model",
        "Accuracy",
        "Weighted Precision",
        "Weighted Recall",
        "Macro F1",
        "Class 0 F1",
        "Class 1 F1",
    ]
    values = [
        model_name,
        report["accuracy"],
        report["weighted avg"]["precision"],
        report["weighted avg"]["recall"],
        report["macro avg"]["f1-score"],
        report["0"]["f1-score"],
        report["1"]["f1-score"],
    ]
    row = dict(zip(columns, values))

    pd.DataFrame([row]).to_csv(csv_path, index=False)
    print(f"Métricas guardadas en '{csv_path}'")

In [None]:
# Persist the test-set metrics under the model label "bertin"; the CSV path is
# left at the function's default.
save_metrics(trues, preds, model_name="bertin")