Setup & imports

In this section we import all required libraries and initialize the OpenAI client.

In [None]:
import os
from openai import OpenAI

def get_openai_client():
    """Return an OpenAI client, prompting for the API key if it is not configured.

    The key is taken from the ``OPENAI_API_KEY`` environment variable. When the
    variable is unset or empty, the user is prompted for it and the entered
    value is stored back into ``os.environ`` for the rest of the session.

    Returns:
        An ``OpenAI`` client constructed with the resolved key.

    Raises:
        ValueError: If the user submits an empty key at the prompt.
    """
    # Local import keeps this cell self-contained; getpass hides the typed
    # secret instead of echoing it into the notebook output like input() does.
    import getpass

    key = os.environ.get("OPENAI_API_KEY")

    if not key:
        key = getpass.getpass("Enter your OpenAI API key: ").strip()
        if not key:
            # Fail here rather than on the first API request with a blank key.
            raise ValueError("No OpenAI API key provided.")
        os.environ["OPENAI_API_KEY"] = key

    return OpenAI(api_key=key)

# Build the shared client (get_openai_client prompts when no key is configured).
client = get_openai_client()
print("OpenAI client is ready.")


# Fallback prompt in case the key is still missing or empty in the environment.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = input("OPENAI-API-KEY")
print("API key OK.")
print("OPENAI_API_KEY" in os.environ)

# Output folders, created relative to notebooks/ (no error if they exist).
for results_dir in ("../results", "../results/confusion_matrices"):
    os.makedirs(results_dir, exist_ok=True)



In [None]:
# IMPORTS AND OPENAI CLIENT

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tqdm import tqdm
from sklearn.metrics import ConfusionMatrixDisplay

# OPENAI_API_KEY is expected to be present in the environment already
# (system env, a .env loader, ...). To set it inline instead:
#     os.environ["OPENAI_API_KEY"] = "YOUR-API-KEY"
client = OpenAI()

# Dataset folder, one level above the notebook directory.
DATA_DIR = Path("../data")
DATA_DIR, [entry for entry in DATA_DIR.iterdir()]

1. Load & preprocess

We load the dataset, merge headline and body text, clean missing values, and perform a basic exploratory data analysis (EDA).

In [None]:
# READ THE RAW DATASET

csv_path = DATA_DIR / "data.csv"
df = pd.read_csv(csv_path)

# Preview the first rows.
df.head()

In [None]:
# BUILD THE MODEL INPUT

# Join headline and body into one text field; a missing part becomes "".
headline = df["Headline"].fillna("")
body = df["Body"].fillna("")
df["text"] = headline + " " + body

# Keep only the text and the target, with the target renamed to "label",
# then drop rows where either of the two is missing.
data = df[["text", "Label"]]
data = data.rename(columns={"Label": "label"})
data = data.dropna(subset=["text", "label"])

data.head()

# Class counts.
data["label"].value_counts()

In [None]:
# EXPLORATORY DATA ANALYSIS

# Bar chart of how many rows each label has.
plt.figure(figsize=(6, 4))
sns.countplot(data=data, x="label")
plt.title("Distribution of Real vs Fake News")
plt.xlabel("Label (0 = Real, 1 = Fake)")
plt.ylabel("Count")
plt.show()

# Whitespace-delimited word count per article.
data["text_length"] = data["text"].map(lambda doc: len(doc.split()))
data["text_length"].describe()

# Histogram of the word counts.
word_counts = data["text_length"]
plt.figure(figsize=(8, 4))
sns.histplot(word_counts, bins=50)
plt.title("Distribution of Text Lengths")
plt.xlabel("Number of Words")
plt.ylabel("Frequency")
plt.show()

# Mean word count per label.
data.groupby("label")["text_length"].mean()

2. Train / Test Split

In this section, we split the cleaned dataset into training and test sets while preserving the original class distribution (stratification).
This ensures that both the training and test sets contain real (0) and fake (1) news examples in the same proportions as the full dataset.

In [None]:
# STRATIFIED 80/20 SPLIT

X, y = data["text"], data["label"]

# Fixed seed for reproducibility; stratify on the label column.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

X_train.shape, X_test.shape

3. Sample Extraction (80 Examples from the Test Set)

Here we extract a small sample from the test set: its first 80 examples.
This subset is used for:

comparing the ML model against LLM-based models

faster LLM evaluation (to reduce inference cost and time)

analyzing specific cases where models disagree

The sample consists of sample_texts (input articles) and sample_labels (ground truth labels).

In [None]:
# SAMPLE (first 80 examples of the test split)

SAMPLE_SIZE = 80

# Positional slice: the leading SAMPLE_SIZE rows of each test series.
X_test_sample = X_test.iloc[:SAMPLE_SIZE]
y_test_sample = y_test.iloc[:SAMPLE_SIZE]

# Plain list of texts and a NumPy array of ground-truth labels.
sample_texts = X_test_sample.tolist()
sample_labels = y_test_sample.to_numpy()

len(sample_texts), len(sample_labels)


4. TF-IDF + Logistic Regression Pipeline

This section implements the traditional Machine Learning baseline for fake/real news classification:

TF–IDF vectorization of the article text

Logistic Regression classifier

Training on the training split

Evaluation on the test split

This model serves as the baseline for comparison with GPT-based classifiers.

In [None]:
# BASELINE: TF-IDF FEATURES -> LOGISTIC REGRESSION

# Vectorizer: English stop words removed, vocabulary capped at 50k terms,
# terms kept only if they occur in at least 2 documents and at most 80% of them.
tfidf = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.8,
    max_features=50000,
)
log_reg = LogisticRegression(max_iter=200)

model = Pipeline(steps=[("tfidf", tfidf), ("lr", log_reg)])
model.fit(X_train, y_train)

# Predictions on the full test split.
ml_preds = model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, ml_preds)
print("Logistic Regression accuracy:", baseline_accuracy)

5. ML Model Evaluation

We evaluate the Logistic Regression model using:

accuracy

per-class precision, recall, and F1-score

This establishes a performance reference point before testing the LLMs.

In [None]:
# BASELINE METRICS ON THE TEST SPLIT

y_pred = model.predict(X_test)

# Compute everything first, then print in the same order as before.
lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)
lr_confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", lr_accuracy)
print()
print(lr_report)
print()
print(lr_confusion)

In [None]:
# OPENAI CLIENT SETUP
# Rebinds the module-level `client` that classify_with_llm() uses. No api_key
# argument is passed here, so the key must already be available via the
# OPENAI_API_KEY environment variable.
client = OpenAI()  # expects your OPENAI_API_KEY in environment variables

6. classify_with_llm() — LLM Classification Function

This helper function sends a binary classification request to an LLM (GPT-4o-mini or GPT-4o).
It:

formats the prompt

requests a single prediction (“0” for REAL, “1” for FAKE)

parses the LLM output

handles API errors gracefully so notebook execution does not break (on an error or an unparseable reply it falls back to 0)

The function returns an integer prediction: 0 or 1.

In [None]:
def classify_with_llm(model_name, text):
    """Classify one news article as REAL (0) or FAKE (1) with a zero-shot prompt.

    Sends a single-turn chat completion request through the module-level
    ``client`` and parses the one-token reply.

    Args:
        model_name: Name of the chat model to query (e.g. "gpt-4o-mini").
        text: Article text inserted into the prompt.

    Returns:
        1 if the model answers "1", otherwise 0. Note that 0 is also the
        fallback when the reply is empty, is anything other than "0"/"1",
        or when the request raises, so failures count as REAL predictions.
    """
    prompt = f"""
    Classify the following news article as REAL (0) or FAKE (1).
    Answer with only 0 or 1.

    Article:
    {text}
    """

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1
        )

        # The v1 client returns message objects, not dicts: the reply text is
        # the `.content` attribute. Subscripting (message["content"]) raises
        # TypeError, which the handler below would turn into a constant 0.
        content = response.choices[0].message.content
        # content may be None (no text returned); treat that as unparseable.
        pred = (content or "").strip()
        return int(pred) if pred in ("0", "1") else 0

    except Exception as e:
        # Best-effort by design: report the failure and keep the batch running.
        print("LLM error:", e)
        return 0
    
    
# run_llm_on_sample() — Batch Evaluation Helper
def run_llm_on_sample(model_name: str, texts):
    """Classify every text with the given model and return the predictions.

    Calls classify_with_llm() once per text, in order, showing a tqdm
    progress bar labelled with the model name.

    Returns:
        An integer NumPy array with one prediction per input text.
    """
    progress = tqdm(texts, desc=f"{model_name} predicting")
    return np.array(
        [classify_with_llm(model_name, article) for article in progress],
        dtype=int,
    )


In [None]:
# === Evaluation sample: first 80 examples of the test set ===

SAMPLE_SIZE = 80

# Leading SAMPLE_SIZE rows of X_test / y_test, selected by position.
X_test_sample = X_test.iloc[:SAMPLE_SIZE]
y_test_sample = y_test.iloc[:SAMPLE_SIZE]

# Texts as a plain list, labels as a NumPy array.
sample_texts = X_test_sample.tolist()
sample_labels = y_test_sample.to_numpy()

len(sample_texts), len(sample_labels)



8. GPT-4o-mini Evaluation

Here we evaluate GPT-4o-mini on the 80-sample subset.
We compute and display:

accuracy

classification report (precision, recall, F1)

GPT-4o-mini serves as a lightweight LLM baseline — cheaper and faster than full GPT-4o.

In [None]:
print("Evaluating GPT-4o-mini...")

# One classification request per sampled article, in sample order.
llm_preds_mini = [
    classify_with_llm("gpt-4o-mini", article) for article in tqdm(sample_texts)
]

mini_accuracy = accuracy_score(sample_labels, llm_preds_mini)
print("Accuracy (GPT-4o-mini):", mini_accuracy)
print(classification_report(sample_labels, llm_preds_mini))


9. GPT-4o (Large Model) Evaluation

In this section we run the same evaluation using GPT-4o (large).
We compare:

model accuracy

error patterns

improvements over GPT-4o-mini

performance relative to the ML baseline

This provides a clear picture of how model scale affects real-world classification ability.

In [None]:
print("Evaluating GPT-4o (Large)...")

# One classification request per sampled article, in sample order.
llm_preds_4o = [
    classify_with_llm("gpt-4o", article) for article in tqdm(sample_texts)
]

gpt4o_accuracy = accuracy_score(sample_labels, llm_preds_4o)
print("Accuracy (GPT-4o):", gpt4o_accuracy)
print(classification_report(sample_labels, llm_preds_4o))


10. Accuracy Comparison — ML vs. LLMs

We combine the accuracy scores from all three models:

Logistic Regression

GPT-4o-mini

GPT-4o

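The results are visualized in a bar chart for easy comparison. Note that the Logistic Regression accuracy is computed on the full test set, whereas the LLM accuracies are computed on the 80-example sample, so the bars are not measured on identical data.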

In [None]:
# Verify that every prerequisite cell has been run before plotting.

required_vars = ["ml_preds", "llm_preds_mini", "llm_preds_4o", "y_test", "sample_labels"]

missing = [v for v in required_vars if v not in globals()]
if missing:
    raise ValueError(f"Missing variables: {missing}. Run all ML + LLM cells first.")

# NOTE: the Logistic Regression score is computed against y_test (the whole
# test split), while both LLM scores are computed against sample_labels.
results = {
    "Model": ["Logistic Regression", "GPT-4o-mini", "GPT-4o"],
    "Accuracy": [
        accuracy_score(y_test, ml_preds),
        accuracy_score(sample_labels, llm_preds_mini),
        accuracy_score(sample_labels, llm_preds_4o)
    ]
}

res_df = pd.DataFrame(results)

plt.figure(figsize=(8, 5))
sns.barplot(data=res_df, x="Model", y="Accuracy")
plt.title("Accuracy Comparison: ML vs LLM")
plt.ylim(0, 1)
# Save BEFORE showing: once plt.show() has run, the current figure is no
# longer the chart drawn above, so a later savefig() writes an empty image.
plt.savefig("../results/accuracy_comparison.png",
            dpi=300, bbox_inches="tight")
plt.show()




11. Confusion Matrix — GPT-4o-mini

This section creates and visualizes the confusion matrix for GPT-4o-mini on the 80-sample evaluation set.
It shows:

how well the model distinguishes REAL vs. FAKE

common misclassifications

whether the model is biased toward one class

In [None]:
# CONFUSION MATRIX FOR GPT-4o-mini
# Rows are the true labels, columns the GPT-4o-mini predictions on the sample.
cm = confusion_matrix(sample_labels, llm_preds_mini)
disp = ConfusionMatrixDisplay(cm)
disp.plot(cmap="Purples")
plt.title("Confusion Matrix — GPT-4o-mini (80 samples)")
# Save BEFORE showing: once plt.show() has run, the current figure is no
# longer the matrix drawn above, so a later savefig() writes an empty image.
plt.savefig("../results/confusion_matrices/gpt4omini_confmat.png",
            dpi=300, bbox_inches="tight")
plt.show()
