In [1]:
# General-purpose
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Torch for modeling
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Scikit-learn for preprocessing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# NLP tools
import spacy
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Progress bar
from tqdm import tqdm

# Word cloud for visualization
from wordcloud import WordCloud


# Good things

In [2]:
dff = pd.read_csv('final_data.csv')
dff.head()

Unnamed: 0,title,text,Label,text_length,word_count,title_length
0,donald trump sends out embarrassing new year e...,donald trump just could not wish all american ...,0,2283,385,72
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,0,1673,248,68
2,sheriff david clarke becomes internet joke for...,friday wa revealed that former milwaukee sheri...,0,2643,422,78
3,trump obsessed even ha obama name coded into h...,christmas day donald trump announced that woul...,0,2095,338,62
4,pope francis just called out donald trump duri...,pope francis used his annual christmas day mes...,0,1990,332,69


In [5]:
spacy.cli.download("en_core_web_md")

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 33.5/33.5 MB 63.2 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Download necessary models
nlp = spacy.load('en_core_web_md')
# nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


In [4]:
from sentence_transformers import SentenceTransformer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device -> {device}')
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

text_embeddings = model.encode(dff['text'].tolist(), show_progress_bar=True, batch_size=32, convert_to_tensor=True, device=device)

Device -> cpu


Batches:   0%|          | 0/3111 [00:00<?, ?it/s]

In [None]:
torch.save({
    'text_embeddings': torch.tensor(text_embeddings)
}, 'embeddings.pth')

In [None]:
# Just for loading The text_embeddings

checkpoint = torch.load('embeddings.pth')

# Retrieve embeddings
text_embeddings = checkpoint['text_embeddings']

text_embeddings = text_embeddings.numpy()

In [None]:
# --- Feature Functions on Tokens (per-article) ---

def paraphrasing_rate(processed_sentences, T=0.8):
    embeddings = [nlp(s['original']).vector for s in processed_sentences]
    N = len(embeddings)
    if N <= 1:
        return 0.0
    count = 0
    for i in range(N):
        sims = [
            cosine_similarity([embeddings[i]], [embeddings[j]])[0][0]
            for j in range(N) if j != i
        ]
        if max(sims, default=0) > T:
            count += 1
    return count / N

def subjectivity_ratio(processed_sentences):
    N = len(processed_sentences)
    if N == 0:
        return 0.0
    subj = sum(1 for s in processed_sentences
               if TextBlob(s['original']).sentiment.subjectivity > 0.5)
    return subj / N

def sentiment_intensity_ratio(processed_sentences):
    N = len(processed_sentences)
    if N == 0:
        return 0.0
    intense = sum(1 for s in processed_sentences
                  if abs(sid.polarity_scores(s['original'])['compound']) > 0.5)
    return intense / N

def average_sentence_length(processed_sentences):
    N = len(processed_sentences)
    if N == 0:
        return 0.0
    total = sum(s['word_count'] for s in processed_sentences)
    return total / N

def manipulative_score(processed_sentences, alpha=0.5, beta=0.5):
    return alpha * subjectivity_ratio(processed_sentences) + beta * sentiment_intensity_ratio(processed_sentences)


# --- Dataset Preparation ---

def prepare_dataset(df, text_vectors):
    """
    Args:
        rows: list of dicts per article, containing 'processed_sentences' and target 'label'
        text_vectors: Nx384 numpy array of text embeddings
    Returns:
        features: torch.Tensor shape (N, 389), where 384 dims + 5 new features
        labels: torch.LongTensor shape (N,)
    """
    X, y = [], []
    for vec, row in zip(text_vectors, df):
        f1 = paraphrasing_rate(df['text'])
        f2 = subjectivity_ratio(df['text'])
        f3 = sentiment_intensity_ratio(df['text'])
        f4 = average_sentence_length(df['text'])
        f5 = manipulative_score(df['text'])
        X.append(list(vec) + [f1, f2, f3, f4, f5])
        y.append(df['Label'])
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.long)


# --- Model Definition ---

class FakeNewsClassifier(nn.Module):
    def __init__(self, input_dim=389):
        super(FakeNewsClassifier, self).__init__()
        hidden1 = 512
        hidden2 = 256
        self.net = nn.Sequential(
                    nn.Linear(input_dim, hidden1),
                    nn.BatchNorm1d(hidden1),
                    nn.ReLU(),
                    nn.Dropout(0.3),
                    nn.Linear(hidden1, hidden2),
                    nn.BatchNorm1d(hidden2),
                    nn.ReLU(),
                    nn.Dropout(0.3),
                    nn.Linear(hidden2, 1)  # Final output
                )

    def forward(self, x):
        return torch.sigmoid(self.net(x)).squeeze(1)

# --- Training and Evaluation Helpers ---

def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device).float()
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
    return total_loss / len(loader.dataset)

def evaluate(model, loader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            probs = model(xb)
            preds = (probs > 0.5).long()
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    return correct / total


# Startin ........................

X, y = prepare_dataset(dff, text_embeddings)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FakeNewsClassifier().to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 5
best_acc = 0
for epoch in range(epochs):
    loss = train(model, train_loader, criterion, optimizer, device)
    acc = evaluate(model, test_loader, device)
    print(f"Epoch {epoch+1}/{epochs} — Loss: {loss:.4f}, Test Acc: {acc:.4f}")

    # optional: early stopping or checkpointing
    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "best_fake_news.pt")
        print("✅ New best!")

print(f"✅ Training done. Best test accuracy: {best_acc:.4f}")
