<a href="https://colab.research.google.com/github/alessandrossC/Detecting_Fake_News/blob/main/04_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install -U datasets --quiet

In [None]:
!pip install captum

In [None]:
import os
import joblib
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from datasets import Dataset, load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from captum.attr import IntegratedGradients
from captum.attr import visualization

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import logging
logging.disable(logging.WARNING)
from functools import partial
import torch.nn.functional as F

# Fix the NumPy and PyTorch RNG state so repeated runs draw the same numbers
np.random.seed(7)
torch.manual_seed(7)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
DATA_PATH = '/content/processed'

# Both splits are stored as zip-compressed CSV files inside DATA_PATH
train_csv = os.path.join(DATA_PATH, 'train.csv.zip')
val_csv = os.path.join(DATA_PATH, 'val.csv.zip')

train_df = pd.read_csv(train_csv, compression='zip')
val_df = pd.read_csv(val_csv, compression='zip')

In [None]:
text_columns = ['title', 'text']

# Keep only the two text inputs and the target column in each split
train_df = train_df.loc[:, ['title', 'text', 'is_fake']]
val_df = val_df.loc[:, ['title', 'text', 'is_fake']]

In [None]:
# Wrap each pandas frame in a Hugging Face Dataset.
# A copy of the frame is handed over (via DataFrame.copy()) to avoid the NumPy warning.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame.copy()) for frame in (train_df, val_df)
)

In [None]:
# Checkpoint name reused later when the classification model is loaded
model_name = 'bert-base-uncased'
batch_size = 16

# Tokenizer for that checkpoint, plus a collator that pads batches with it
tokenizer = BertTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def tokenize_function(example, max_length=256):
    """
    Tokenize (title, text) pairs with the notebook-level ``tokenizer``.

    Args:
        example: Mapping with "title" and "text" entries. In this notebook it
            is a batch produced by ``Dataset.map(..., batched=True)``.
        max_length (int): Maximum length of the encoded pair; longer inputs
            are truncated. Defaults to 256.

    Returns:
        The tokenizer output for the pair(s), truncated but NOT padded.
    """
    # Padding is intentionally disabled here: the DataLoaders use
    # DataCollatorWithPadding, which pads each training batch to its own
    # longest sequence. Padding inside a batched map would instead pad every
    # map chunk up front, storing the pad tokens on disk and making the
    # collator redundant.
    return tokenizer(
        example["title"],
        example["text"],
        truncation=True,
        padding=False,
        max_length=max_length,
        return_tensors=None
    )

def prepare_dataset(dataset, tokenize_function):
    """
    Tokenize a dataset and rename its target column for training.

    Args:
        dataset: Object exposing ``map`` and ``rename_column``; in this
            notebook it is a Hugging Face Dataset, not a pandas DataFrame.
        tokenize_function: Callable applied to batches through ``dataset.map``.

    Returns:
        The mapped dataset with the 'title' and 'text' columns removed and
        the "is_fake" column renamed to "labels".
    """
    raw_text_columns = ['title', 'text']
    return dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=raw_text_columns,
    ).rename_column("is_fake", "labels")


In [None]:
# Run both splits through the same tokenize_function (train first, then validation)
tokenized_train_dataset, tokenized_val_dataset = (
    prepare_dataset(split, tokenize_function)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/28595 [00:00<?, ? examples/s]

Map:   0%|          | 0/7149 [00:00<?, ? examples/s]

In [None]:
# Persist the tokenized splits as parquet files inside DATA_PATH
train_parquet = DATA_PATH + '/tokenized_train_dataset.parquet'
val_parquet = DATA_PATH + '/tokenized_val_dataset.parquet'

tokenized_train_dataset.to_parquet(train_parquet)
tokenized_val_dataset.to_parquet(val_parquet)

Creating parquet from Arrow format:   0%|          | 0/29 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

11123844

In [None]:
# Split name -> parquet file name, resolved relative to DATA_PATH
data_files = dict(
    train='tokenized_train_dataset.parquet',
    validation='tokenized_val_dataset.parquet',
)

raw_datasets = load_dataset('parquet', data_files=data_files, data_dir=DATA_PATH)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [None]:
# From here on, use the splits as reloaded from the parquet files
tokenized_train_dataset, tokenized_val_dataset = (
    raw_datasets['train'],
    raw_datasets['validation'],
)

In [None]:
# Build the loaders; data_collator (DataCollatorWithPadding) assembles each batch.
# Only the training loader shuffles.
train_loader = DataLoader(
    dataset=tokenized_train_dataset,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

val_loader = DataLoader(
    dataset=tokenized_val_dataset,
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Training hyperparameters
# NOTE: the data loaders were already constructed with the batch_size assigned
# alongside the tokenizer; changing the value here alone does not rebuild them.
epochs = 2
learning_rate = 1e-5
warmup_steps = 1000
batch_size = 16

In [None]:
# Load the pretrained checkpoint configured for two output labels;
# attention maps and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    output_hidden_states=False,
    output_attentions=False,
    num_labels=2,
)

# Move the weights to the selected device (last expression, so the cell displays the module)
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Loss function
loss_fn = nn.CrossEntropyLoss()

# The scheduler is stepped once per training batch, so its horizon is
# (batches per epoch) x (number of epochs).
total_steps = epochs * len(train_loader)

# Optimizer and linear warmup/decay schedule
optimizer = AdamW(params=model.parameters(), lr=learning_rate, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=warmup_steps,
)

In [None]:
def accuracy_fn(y_true, y_pred):
    """
    Calculates accuracy between truth labels and predictions.

    Args:
        y_true (torch.Tensor): Ground truth labels.
        y_pred (torch.Tensor): Raw logits of shape (N, C) with C > 1, or
            predicted labels (shape (N,) or (N, 1)).

    Returns:
        float: Accuracy as a percentage (0-100). Returns 0.0 when there are
        no predictions.
    """
    # If y_pred holds per-class logits, reduce it to label indices first
    if y_pred.ndim > 1 and y_pred.size(1) > 1:
        y_pred_labels = torch.argmax(y_pred, dim=1)
    else:
        y_pred_labels = y_pred

    # Flatten both tensors so that e.g. (N, 1) predictions vs (N,) labels are
    # compared element-wise instead of broadcasting into an (N, N) matrix,
    # which would inflate the number of "correct" matches.
    y_pred_labels = y_pred_labels.reshape(-1)
    y_true = y_true.reshape(-1)

    total = y_pred_labels.numel()
    if total == 0:
        # Nothing to score; avoid dividing by zero
        return 0.0

    correct = torch.eq(y_true, y_pred_labels).sum().item()
    return (correct / total) * 100

In [None]:
def train_step(model, data_loader, optimizer, scheduler, loss_fn, accuracy_fn, device):
    """
    Training step for one epoch.

    Args:
        model: Callable model returning an object with a ``logits`` attribute
            when called with ``input_ids`` and ``attention_mask``.
        data_loader: Iterable of batches, each a mapping with 'input_ids',
            'attention_mask' and 'labels' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)``; assumed to return the
            mean loss over the batch (as the nn.CrossEntropyLoss() used in
            this notebook does).
        accuracy_fn: Callable ``accuracy_fn(labels, predicted_labels)``
            returning a percentage.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean training loss per sample over the epoch (NaN when the
        loader yields no samples).
    """
    model.train()
    loss_sum, acc_sum, n_seen = 0.0, 0.0, 0

    for batch in tqdm(data_loader, desc='Training'):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = loss_fn(logits, labels)

        # Weight each batch by its size: the last batch is usually smaller,
        # and a plain mean over batches would over-weight its samples.
        batch_n = labels.size(0)
        loss_sum += loss.item() * batch_n
        acc_sum += accuracy_fn(labels, logits.argmax(dim=1)) * batch_n
        n_seen += batch_n

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    if n_seen:
        train_loss = loss_sum / n_seen
        train_acc = acc_sum / n_seen
    else:
        # Empty loader: report NaN instead of raising ZeroDivisionError
        train_loss = train_acc = float('nan')

    print(f"Train loss: {train_loss:.5f} | Train accuracy: {train_acc:.2f}%")
    return train_loss

In [None]:
def test_step(model, data_loader, loss_fn, accuracy_fn, device):
    model.eval()
    test_loss, test_acc = 0, 0

    predictions = []
    true_labels = []
    probabilities = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc='Evaluating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = loss_fn(logits, labels)
            test_loss += loss.item()
            test_acc += accuracy_fn(labels, logits.argmax(dim=1))

            # Get probabilities for log-loss calculation
            probs = torch.nn.functional.softmax(logits, dim=-1)
            probabilities.extend(probs.cpu().numpy())

            # Get predictions for F1 score
            preds = logits.argmax(dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    predictions = np.array(predictions)
    true_labels = np.array(true_labels)

    # Calculate metrics
    test_loss /= len(data_loader)
    test_acc /= len(data_loader)

    f1 = f1_score(true_labels, predictions)


    print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}% | Test F1 Score: {f1:.4f}")
    return {
        'loss': test_loss,
        'accuracy': test_acc,
        'f1_score': f1,
        'predictions': predictions,
        'true_labels': true_labels,
    }

In [None]:
# Training loop: one pass over the training set followed by one validation pass per epoch
train_losses, val_losses = [], []
val_accuracies, val_f1_scores = [], []

print("Starting training...")

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    print("-" * 50)

    # Training
    train_loss = train_step(model, train_loader, optimizer, scheduler, loss_fn, accuracy_fn, device)
    train_losses.append(train_loss)

    # Validation (kept after every epoch); each metric goes into its own history list
    val_results = test_step(model, val_loader, loss_fn, accuracy_fn, device)
    for history, key in (
        (val_losses, 'loss'),
        (val_accuracies, 'accuracy'),
        (val_f1_scores, 'f1_score'),
    ):
        history.append(val_results[key])

print("\nTraining completed!")

# Summarise the best value each validation metric reached across epochs
print(f"\nFinal Results:")
print(f"Best Validation Log Loss: {min(val_losses):.4f}")
print(f"Best Validation F1 Score: {max(val_f1_scores):.4f}")
print(f"Best Validation Accuracy: {max(val_accuracies):.2f}%")

Starting training...

Epoch 1/2
--------------------------------------------------


Training:   0%|          | 0/1788 [00:00<?, ?it/s]

Train loss: 0.08483 | Train accuracy: 96.35%


Evaluating:   0%|          | 0/447 [00:00<?, ?it/s]

Test loss: 0.00590 | Test accuracy: 99.92% | Test F1 Score: 0.9992

Epoch 2/2
--------------------------------------------------


Training:   0%|          | 0/1788 [00:00<?, ?it/s]

Train loss: 0.00205 | Train accuracy: 99.95%


Evaluating:   0%|          | 0/447 [00:00<?, ?it/s]

Test loss: 0.00048 | Test accuracy: 99.99% | Test F1 Score: 0.9999

Training completed!

Final Results:
Best Validation Log Loss: 0.0005
Best Validation F1 Score: 0.9999
Best Validation Accuracy: 99.99%


The model demonstrates outstanding performance on the validation set, achieving a nearly perfect F1 score and accuracy. Such results may indicate very strong learning, but it is also worth checking for potential data leakage, label errors, or a classification task that is too easy. If the validation set is representative and correctly separated from the training data, this means the model has learned to distinguish fake news with almost complete confidence.

In [None]:
# Save the fine-tuned weights and the tokenizer files into the same directory
# so both can later be reloaded from one path.
MODEL_PATH = '/content/models/fack_news_model'
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)
# Report the directory that was actually written
print(f"\nModel saved to '{MODEL_PATH}'")


Model saved to './fack_news_model'


In [None]:
def predict_is_fake(title, text, model, tokenizer, device):
    """
    Predict if a news article is fake (1) or real (0).

    Args:
        title: Article title passed to the tokenizer as the first sequence.
        text: Article body passed to the tokenizer as the second sequence.
        model: Callable model returning an object with a ``logits`` attribute.
        tokenizer: Callable producing 'input_ids' and 'attention_mask' tensors.
        device: Device the encoded tensors are moved to.

    Returns:
        dict: 'is_fake' (True when the class-1 probability exceeds 0.5),
        'probability' (class-1 probability) and 'confidence' (the larger of
        that probability and its complement).
    """
    model.eval()

    encoded = tokenizer(
        title,
        text,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors='pt'
    )

    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        ).logits
        class_probs = torch.nn.functional.softmax(logits, dim=-1)

    # First row, column 1: probability of class 1 ("is_fake"=1)
    p_fake = class_probs[0][1].item()

    return {
        'is_fake': p_fake > 0.5,
        'probability': p_fake,
        'confidence': max(p_fake, 1 - p_fake)
    }

# Example usage: score one hand-written title/body pair with the fine-tuned model
example_title = "Big changes coming to city parking regulations"
example_text = "The city council has announced a series of major reforms to parking..."
result = predict_is_fake(
    title=example_title,
    text=example_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Echo the inputs next to the prediction so the output can be read on its own
print("\nExample Prediction:")
print(f"Title: {example_title}")
print(f"Text: {example_text}")
print(f"Is Fake: {result['is_fake']}")
print(f"Probability (fake): {result['probability']:.4f}")
print(f"Confidence: {result['confidence']:.4f}")


Example Prediction:
Title: Big changes coming to city parking regulations
Text: The city council has announced a series of major reforms to parking...
Is Fake: True
Probability (fake): 0.9991
Confidence: 0.9991


In [None]:
# Log-loss figures for the three non-BERT models; the numbers are written in
# by hand here rather than computed in this notebook.
experiment_results = dict(
    model=['Logistic Regression', 'Random Forest', 'XGBoost'],
    log_loss_train=[0.0273, 0.0303, 0.1997],
    log_loss_val=[0.0310, 0.0867, 0.2001],
)

In [None]:
DATA_PATH = '/content/processed'

# Serialise the results object to a joblib file inside DATA_PATH
baseline_results_file = os.path.join(DATA_PATH, 'experiment_results.joblib')
joblib.dump(experiment_results, baseline_results_file)

['/content/processed/experiment_results.joblib']

In [None]:
# Rebind the name to an empty list; the dict it previously referred to is no
# longer reachable through this variable.
experiment_results = list()

In [None]:
# One record for this notebook's model, built from the last epoch's losses
results = {
    'model': 'BERT Fine-Tuning',
    'log_loss_train': np.round(train_losses[-1], 5),
    'log_loss_val': np.round(val_losses[-1], 5),
}
experiment_results.append(results)

In [None]:
DATA_PATH = '/content/processed'

# NOTE(review): this is the same target file as the earlier dump of the
# three-model results dict, so that file is replaced by the current list —
# confirm the overwrite is intended.
bert_results_file = os.path.join(DATA_PATH, 'experiment_results.joblib')
joblib.dump(experiment_results, bert_results_file)

['/content/processed/experiment_results.joblib']

In [None]:
# Reload the fine-tuned model and its tokenizer from the directory they were saved to.
# Note: the reloaded model is not moved to `device` in this cell.
model_path = '/content/models/fack_news_model'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Switch to evaluation mode (last expression, so the cell displays the module)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def custom_forward(model, embeddings, attention_mask):
    """
    Run the model on pre-computed embeddings and return the class-1 probability.

    Args:
        model: Callable accepting ``inputs_embeds`` and ``attention_mask`` and
            returning an object with a ``logits`` attribute.
        embeddings: Tensor passed to the model as ``inputs_embeds``.
        attention_mask: Attention mask forwarded unchanged to the model.

    Returns:
        Tensor with one value per row of the logits: the softmax probability
        of column 1.
    """
    logits = model(inputs_embeds=embeddings, attention_mask=attention_mask).logits
    class_probs = F.softmax(logits, dim=1)
    return class_probs[:, 1]

In [None]:
# Integrated Gradients over the class-1 ("Fake") probability returned by
# custom_forward, attributed to the token embeddings fed to the model.
ig = IntegratedGradients(partial(custom_forward, model))

# Tokenized inputs are moved to wherever the model's weights currently live
model_device = next(model.parameters()).device

# Loop over random news articles
for i in np.random.choice(val_df.index, size=3, replace=False):
    title = val_df.loc[i, 'title']
    text = val_df.loc[i, 'text']
    true_label = val_df.loc[i, 'is_fake']

    print(f"\n🔹 Example {i}:")
    print(f"Title: {title}")
    print(f"Text: {text[:120]}...")
    print(f"Is Fake: {bool(true_label)}")

    # Tokenize
    inputs = tokenizer(
        title,
        text,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors='pt'
    )
    input_ids = inputs["input_ids"].to(model_device)
    attention_mask = inputs["attention_mask"].to(model_device)

    with torch.no_grad():
        # Get prediction
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=1)
        pred_label = "Fake" if probs[0][1] > 0.5 else "Real"

        # Compute embeddings: use the word (token) embeddings only.
        # custom_forward passes them as `inputs_embeds`, and the model's
        # embedding layer then adds position/segment embeddings and applies
        # LayerNorm itself. Feeding the full `model.bert.embeddings(...)`
        # output would apply those steps twice, so the attributed function
        # would not match the prediction computed above.
        input_embeddings = model.bert.embeddings.word_embeddings(input_ids)

    # Get attributions
    attributions, delta = ig.attribute(
        inputs=input_embeddings,
        additional_forward_args=(attention_mask,),
        return_convergence_delta=True,
    )

    # Prepare tokens and attribution scores (one score per token: sum over the embedding dimension)
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    attr_scores = attributions[0].sum(dim=-1).detach().cpu().numpy().tolist()

    # Prepare visualization record
    viz_data_record = visualization.VisualizationDataRecord(
        word_attributions=attr_scores,
        pred_prob=probs[0][1].item(),
        pred_class=pred_label,
        true_class="Fake" if true_label else "Real",
        attr_class="Fake",
        attr_score=sum(attr_scores),
        raw_input_ids=tokens,
        convergence_score=delta.item()
    )

    # Visualize
    visualization.visualize_text([viz_data_record])


🔹 Example 6551:
Title: Angelina Jolie condemns sexual violence against Rohingya women refugees
Text: DHAKA (Reuters) - Filmstar Angelina Jolie has condemned sexual violence inflicted on Rohingya women in Myanmar s Rakhine...
Is Fake: False


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Real,Real (0.00),Fake,-27.92,"[CLS] angelina jo ##lie condemn ##s sexual violence against ro ##hing ##ya women refugees [SEP] dhaka ( reuters ) - films ##tar angelina jo ##lie has condemned sexual violence inflicted on ro ##hing ##ya women in myanmar s ra ##kh ##ine state , where a military counter - insurgency operation has sent hundreds of thousands of ro ##hing ##ya muslim refugees across the border to bangladesh . more than 600 , 000 ro ##hing ##ya muslims have fled buddhist - majority myanmar since late august , driven out by the military s actions that a top united nations official has described as a classic case of ethnic cleansing . jo ##lie , a special envoy of the united nations high commissioner for refugees ( un ##hc ##r ) , told a bangladesh delegation in the canadian city of vancouver that she planned to visit the ro ##hing ##ya victims of sexual violence . later she mentioned accordingly in her keynote speech about the sexual violence faced by almost each female ro ##hing ##ya who fled to bangladesh and condemned the armed conflict in myanmar , bangladesh s foreign ministry said in a statement on thursday . it gave no details of jo ##lie s proposed trip . on thursday , new york - based human rights watch accused myanmar security forces of committing widespread rape against women and girls as part of a campaign of ethnic cleansing . the all ##ega ##tion echoes an accusation this week by pr ##ami ##la pat ##ten , the [SEP]"
,,,,



🔹 Example 5892:
Title: Maine governor rejects latest budget deal, ready for shutdown
Text: (Reuters) - Maine Governor Paul LePage said on Friday he would reject the latest budget deal by a bipartisan group of le...
Is Fake: False


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Real,Real (0.00),Fake,-0.17,"[CLS] maine governor rejects latest budget deal , ready for shut ##down [SEP] ( reuters ) - maine governor paul le ##page said on friday he would reject the latest budget deal by a bi ##partisan group of legislators , warning that most of the state government would shut down without more spending cuts . the second - term republican said repeatedly this week that he would be ready to close down non - essential government services as the july 4 holiday weekend starts if the law ##makers cannot agree on a $ 7 . 05 ##5 billion two - year budget that would require no new taxes . the deal reached late on thursday called for $ 7 . 1 billion in spending . “ i ’ m out of ink , ” le ##page told reporters in the state capital of augusta . “ there will not be a signature on a budget that increases taxes . ” the full legislature was due to vote on friday , the day after a six - member bi ##partisan budget committee reached a deal that would repeal a measure that voters approved in november to impose an additional 3 percent income tax on state residents who earn more than $ 200 , 000 a year . the proposed budget also increases public education funding by $ 162 million . the leaders of maine ’ s democrat - controlled house of representatives and republican - controlled senate supported the measure , but it was unclear if [SEP]"
,,,,



🔹 Example 2193:
Title: WHOA! FIRST LADY MELANIA Makes Classy, Yet Blistering Response To Kathy Griffin’s ISIS Inspired Photo Of Decapitated President Trump
Text: Social media erupted yesterday when outspoken comedian Kathy Griffin, posed for a photo shoot with controversial photogr...
Is Fake: True


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
Fake,Fake (1.00),Fake,0.03,"[CLS] whoa ! first lady mel ##ania makes class ##y , yet b ##list ##ering response to kathy griffin ’ s isis inspired photo of dec ##ap ##itated president trump [SEP] social media erupted yesterday when outspoken comedian kathy griffin , posed for a photo shoot with controversial photographer tyler shields where she held a blood ##ied , dec ##ap ##itated mask of president trump . today , mel ##ania trump questioned kathy griffin s mental health in a b ##list ##ering statement after she posted a photo of her holding a blood ##ied , dec ##ap ##itated version of president trump s head . the video posted on twitter of griffin holding the dec ##ap ##itated head of president trump has since been removed . as a mother , a wife , and a human being , that photo is very disturbing , the first lady said in a statement wednesday . when you consider some of the atrocities happening in the world today , a photo opportunity like this is simply wrong and makes you wonder about the mental health of the person who did it . daily caller ##pres ##ide ##nt trump took to twitter to respond to the un ##bel ##ie ##va ##bly vile comedian kathy griffin . trump s t ##wee ##t also reminded griffin that he has an 11 - year old son who has the ability to see her disgusting depiction of his father s blood ##ied , dec ##ap ##itated head : kathy griffin should be ashamed [SEP]"
,,,,
