In [2]:
import os
import re
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
  accuracy_score,
  f1_score,
  precision_score,
  recall_score,
  classification_report,
  confusion_matrix,
  roc_curve,
  auc,
)

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

# Single seed shared by every stochastic step in the notebook.
SEED = 42

# Seed the global NumPy and PyTorch RNGs so that code which does not take an
# explicit `random_state=SEED` is still reproducible after Restart & Run All.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {DEVICE}")

  from .autonotebook import tqdm as notebook_tqdm


  from .autonotebook import tqdm as notebook_tqdm


Device: cuda


In [None]:
# Load the raw statements; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/politifact_statements.csv")


# Collapse the six verdicts into a binary target:
# 1 for "True" / "Mostly True" / "Half True", 0 for the three remaining verdicts.
LABEL_MAP = {
  "True": 1,
  "Mostly True": 1,
  "Half True": 1,
  "Mostly False": 0,
  "False": 0,
  "Pants on Fire": 0,
}

df["label"] = df["verdict"].map(LABEL_MAP)
# Verdicts missing from LABEL_MAP map to NaN, which turns the column into
# float64. Drop those rows, then cast back so labels are always integers
# (0/1) regardless of whether any row was dropped.
df = df.dropna(subset=["label"]).astype({"label": int})
# Shuffle deterministically and renumber the rows.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

print(f"Total samples: {len(df)}")
print("\nLabel distribution:")
print(df["label"].value_counts())
print("\nVerdict breakdown:")
print(df["verdict"].value_counts())

Total samples: 25999

Label distribution:
label
0    15978
1    10021
Name: count, dtype: int64

Verdict breakdown:
verdict
False            8644
Half True        3844
Mostly False     3828
Mostly True      3530
Pants on Fire    3506
True             2647
Name: count, dtype: int64


In [None]:
def clean_text(text):
  """Normalize a statement for bag-of-words style vectorization.

  The text is lower-cased, every character other than a-z, 0-9, whitespace
  and the punctuation ``. , ! ?`` is replaced by a space, runs of whitespace
  are collapsed to a single space, and the ends are trimmed.

  Args:
    text: The raw statement. ``None`` and float NaN (how pandas represents
      an empty CSV field) are treated as missing; any other non-string
      value is converted with ``str()`` first.

  Returns:
    The cleaned string, or ``""`` when ``text`` is missing.
  """
  # NaN is the only float that is not equal to itself.
  if text is None or (isinstance(text, float) and text != text):
    return ""

  text = str(text).lower()
  # Replace (rather than delete) disallowed characters so the words on
  # either side stay separated, e.g. "chick-fil-a" -> "chick fil a".
  text = re.sub(r"[^a-z0-9\s.,!?]", " ", text)
  text = re.sub(r"\s+", " ", text)

  return text.strip()


# Run every raw statement through clean_text and keep the result next to the original.
df["cleaned_text"] = df["statement"].map(clean_text)
# Preview the first rows to eyeball the raw vs. cleaned text.
df.head()

Unnamed: 0,verdict,statement,statement_date,statement_source,factcheck_date,url,label,cleaned_text
0,Mostly False,"In Texas, a person can register a dead relativ...",2022-10-02,Facebook posts,2022-10-17,https://www.politifact.com/factchecks/2022/oct...,0,"in texas, a person can register a dead relativ..."
1,Mostly False,"""So far, out of 175 arrested in Kenosha, 102 w...",2020-09-01,Facebook posts,2020-09-18,https://www.politifact.com/factchecks/2020/sep...,0,"so far, out of 175 arrested in kenosha, 102 we..."
2,Half True,The decision to remove Chick-fil-A from Emory ...,2013-03-12,David Furhman,2013-04-19,https://www.politifact.com/factchecks/2013/apr...,1,the decision to remove chick fil a from emory ...
3,Pants on Fire,Aileen Cannon has been arrested.,2022-10-01,Facebook posts,2022-10-06,https://www.politifact.com/factchecks/2022/oct...,0,aileen cannon has been arrested.
4,Mostly False,"""A California health care CEO is promising to ...",2017-05-23,Ohioans Against Deceptive Rx Ballot Issue,2017-10-13,https://www.politifact.com/factchecks/2017/oct...,0,a california health care ceo is promising to r...


In [None]:
# 80/20 split, stratified on the binary label so both parts keep the same
# class balance; seeded so the split is reproducible.
split_options = {
  "test_size": 0.2,
  "random_state": SEED,
  "stratify": df["label"],
}
train_df, test_df = train_test_split(df, **split_options)

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Train size: 20799
Test size: 5200


---

# GradientBoosting + TF-IDF


In [None]:
print("Training GradientBoosting + TF-IDF...")

# Learn the TF-IDF vocabulary and weights from the training split only;
# the test split is transformed with that fitted vectorizer.
train_texts = train_df["cleaned_text"]
test_texts = test_df["cleaned_text"]

tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# fit() returns the estimator itself, so construction and training can be chained.
gb_model = GradientBoostingClassifier(
  n_estimators=100,
  random_state=SEED,
).fit(X_train_tfidf, train_df["label"])

# Column 1 of predict_proba is the probability of label 1.
gb_probs = gb_model.predict_proba(X_test_tfidf)[:, 1]
gb_preds = gb_model.predict(X_test_tfidf)

print("GradientBoosting training complete!")

Training GradientBoosting + TF-IDF...
GradientBoosting training complete!


In [None]:
# Headline metrics on the held-out split. The averaged metrics use
# average="weighted" (per-class scores weighted by class support); note that
# weighted recall is mathematically equal to accuracy.
y_test = test_df["label"]
weighted_scorers = {
  "f1": f1_score,
  "precision": precision_score,
  "recall": recall_score,
}

gb_results = {"accuracy": accuracy_score(y_test, gb_preds)}
for scorer_name, scorer in weighted_scorers.items():
  gb_results[scorer_name] = scorer(y_test, gb_preds, average="weighted")

banner = "=" * 60
print(banner)
print("GRADIENTBOOSTING + TF-IDF RESULTS")
print(banner)
for metric, value in gb_results.items():
  print(f"{metric.capitalize():12}: {value:.4f}")
print(banner)
print("\nClassification Report:")
# target_names follow sorted label order: 0 -> "Fake", 1 -> "Real".
print(classification_report(y_test, gb_preds, target_names=["Fake", "Real"]))

GRADIENTBOOSTING + TF-IDF RESULTS
Accuracy    : 0.6815
F1          : 0.6520
Precision   : 0.6782
Recall      : 0.6815

Classification Report:
              precision    recall  f1-score   support

        Fake       0.69      0.89      0.77      3196
        Real       0.67      0.35      0.46      2004

    accuracy                           0.68      5200
   macro avg       0.68      0.62      0.62      5200
weighted avg       0.68      0.68      0.65      5200

