In [8]:
# General-purpose
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Torch for modeling
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Scikit-learn for preprocessing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

# NLP tools
import spacy
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Progress bar
from tqdm import tqdm, trange
import time

# Word cloud for visualization
from wordcloud import WordCloud


# Good things

In [2]:
dff = pd.read_csv('final_data.csv')
dff.head()

Unnamed: 0,title,text,Label,text_length,word_count,title_length
0,donald trump sends out embarrassing new year e...,donald trump just could not wish all american ...,0,2283,385,72
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,0,1673,248,68
2,sheriff david clarke becomes internet joke for...,friday wa revealed that former milwaukee sheri...,0,2643,422,78
3,trump obsessed even ha obama name coded into h...,christmas day donald trump announced that woul...,0,2095,338,62
4,pope francis just called out donald trump duri...,pope francis used his annual christmas day mes...,0,1990,332,69


In [3]:
spacy.cli.download("en_core_web_md")

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 33.5/33.5 MB 28.4 MB/s eta 0:00:00
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m‚ö† Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
# Download necessary models
nlp = spacy.load('en_core_web_md')
# nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()


In [5]:
from sentence_transformers import SentenceTransformer

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device -> {device}')
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

text_embeddings = model.encode(dff['text'].tolist(), show_progress_bar=True, batch_size=32, convert_to_tensor=True, device=device)

Device -> cpu


Batches:   0%|          | 0/3111 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [5]:
torch.save({
    'text_embeddings': torch.tensor(text_embeddings)
}, 'embeddings.pth')

  'text_embeddings': torch.tensor(text_embeddings)


In [6]:
# Just for loading The text_embeddings

checkpoint = torch.load('embeddings.pth')

# Retrieve embeddings
text_embeddings = checkpoint['text_embeddings']

text_embeddings = text_embeddings.numpy()

In [9]:

# --- Sentence-level Preprocessing ---

def preprocess_article(text):
    doc = nlp(text)
    processed_sentences = []
    for sent in doc.sents:
        s = sent.text.strip()
        if len(s) > 1:
            processed_sentences.append({
                "original": s,
                "word_count": len(s.split())
            })
    return processed_sentences

# --- Feature Functions ---

def paraphrasing_rate(processed_sentences, T=0.8):
    embeddings = [nlp(s['original']).vector for s in processed_sentences]
    N = len(embeddings)
    if N <= 1:
        return 0.0
    count = 0
    for i in range(N):
        sims = [
            cosine_similarity([embeddings[i]], [embeddings[j]])[0][0]
            for j in range(N) if j != i
        ]
        if max(sims, default=0) > T:
            count += 1
    return count / N

def subjectivity_ratio(processed_sentences):
    N = len(processed_sentences)
    if N == 0:
        return 0.0
    subj = sum(1 for s in processed_sentences
               if TextBlob(s['original']).sentiment.subjectivity > 0.5)
    return subj / N

def sentiment_intensity_ratio(processed_sentences):
    N = len(processed_sentences)
    if N == 0:
        return 0.0
    intense = sum(1 for s in processed_sentences
                  if abs(sid.polarity_scores(s['original'])['compound']) > 0.5)
    return intense / N

def average_sentence_length(processed_sentences):
    N = len(processed_sentences)
    if N == 0:
        return 0.0
    total = sum(s['word_count'] for s in processed_sentences)
    return total / N

def manipulative_score(processed_sentences, alpha=0.5, beta=0.5):
    return alpha * subjectivity_ratio(processed_sentences) + beta * sentiment_intensity_ratio(processed_sentences)

# --- Dataset Preparation ---

def prepare_dataset(df, text_vectors):
    X, y = [], []
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="üîç Preprocessing articles"):
        text = row['text']
        label = row['Label']
        vec = text_vectors[idx]  # already extracted 384-d vector

        processed = preprocess_article(text)
        f1 = paraphrasing_rate(processed)
        f2 = subjectivity_ratio(processed)
        f3 = sentiment_intensity_ratio(processed)
        f4 = average_sentence_length(processed)
        f5 = manipulative_score(processed)

        X.append(list(vec) + [f1, f2, f3, f4, f5])
        y.append(label)

    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.long)

# --- Model Definition ---

class FakeNewsClassifier(nn.Module):
    def __init__(self, input_dim=389):
        super(FakeNewsClassifier, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )

    def forward(self, x):
        return torch.sigmoid(self.net(x)).squeeze(1)

# --- Training and Evaluation ---

def train(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    loop = tqdm(loader, desc="üõ†Ô∏è Training", leave=False)
    for xb, yb in loop:
        xb, yb = xb.to(device), yb.to(device).float()
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)
        loop.set_postfix(loss=loss.item())
    return total_loss / len(loader.dataset)

def evaluate(model, loader, device):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            preds = (model(xb) > 0.5).long()
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    return correct / total

# --- Main Training Loop ---

# Replace with your real data
# dff = DataFrame with columns ['text', 'Label']
# text_embeddings = Numpy array with shape (N, 384)

X, y = prepare_dataset(dff, text_embeddings)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FakeNewsClassifier().to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 5
best_acc = 0

print("üèÅ Starting training...\n")

for epoch in trange(epochs, desc="üìä Epoch Progress"):
    start = time.time()

    loss = train(model, train_loader, criterion, optimizer, device)
    acc = evaluate(model, test_loader, device)

    elapsed = time.time() - start
    print(f"üìÖ Epoch {epoch+1}/{epochs} ‚Äî üß† Loss: {loss:.4f} | üéØ Test Acc: {acc:.4f} | ‚è±Ô∏è Time: {elapsed:.2f}s")

    if acc > best_acc:
        best_acc = acc
        torch.save(model.state_dict(), "best_fake_news.pt")
        print("‚úÖ New best model saved!")

print(f"\n‚úÖ Training done. Best test accuracy: {best_acc:.4f}")

üîç Preprocessing articles:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 41994/99531 [37:16<1:09:08, 13.87it/s] 