In [45]:
# Cell 1 — dependencies
import requests
from bs4 import BeautifulSoup
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [46]:
# Cell 2 — define your training URLs + labels
data = [
    # (url, label)
    # Trump–Zelenskyy
    ("https://www.bbc.com/news/articles/clyj3j2r25wo", "center"),
    ("https://www.cnn.com/2025/02/28/politics/trump-zelensky-vance-oval-office/index.html", "center"),
    ("https://www.foxnews.com/politics/trump-vance-zelenskyy-spar-over-russian-war-tense-exchange-very-disrespectful", "right"),
    ("https://www.msnbc.com/top-stories/latest/zelenskyy-trump-vance-meeting-ukraine-shouting-match-rcna194282", "left"),
    # DEI removal
    ("https://www.bbc.com/news/articles/c24110m30ddo", "center"),
    ("https://www.cnn.com/2025/03/07/us/university-dei-housing-scholarships-college/index.html", "center"),
    ("https://www.foxnews.com/politics/trump-signs-executive-orders-banning-radical-gender-ideology-dei-initiatives-military", "right"),
    ("https://www.msnbc.com/top-stories/latest/trump-federal-diversity-equity-inclusion-employees-dei-leave-rcna188721", "left"),
    # Department of Education
    ("https://www.bbc.com/news/articles/c79zxzj90nno", "center"),
    ("https://www.cnn.com/2025/03/22/us/department-of-education-resources-disabled-children/index.html", "center"),
    ("https://www.foxnews.com/politics/trump-sign-executive-order-abolish-department-education", "right"),
    ("https://www.msnbc.com/top-stories/latest/education-dept-impeach-judges-lawyers-trump-newsletter-rcna197077", "left"),
    # Deportation
    ("https://www.bbc.com/news/articles/cp9yv1gnzyvo", "center"),
    # (add CNN link here if available),
    ("https://www.foxnews.com/politics/trump-tells-illegal-immigrants-self-deport-using-cbp-home-app-new-video", "right"),
    ("https://www.msnbc.com/opinion/msnbc-opinion/judge-boasberg-trump-alien-enemies-act-argument-rcna198463", "left"),
    # Gold cards
    ("https://www.bbc.com/news/articles/cq5zgvdz2z0o", "center"),
    ("https://www.foxnews.com/world/trump-touts-5-million-gold-card-new-path-citizenship", "right"),
    ("https://www.msnbc.com/top-stories/latest/gold-visas-trump-fox-news-interview-rcna197352", "left"),
    # Elon Musk & Doge
    ("https://www.bbc.com/news/articles/c23vkd57471o", "center"),
    ("https://www.foxnews.com/politics/elon-musk-strategizes-trillion-spending-cuts-house-doge-panel-closed-door-meeting", "right"),
    ("https://www.msnbc.com/opinion/msnbc-opinion/elon-musk-doge-mass-firings-government-rcna193563", "left"),
]


In [47]:
# Cell 3 — article‐scraping helper (unchanged)
def fetch_article_text(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")
    paras = soup.find_all("p")
    text = " ".join(p.get_text() for p in paras)
    return re.sub(r"\s+", " ", text).strip()

In [48]:
# Cell 4 — build scraped DataFrame
rows = []
for url, label in data:
    try:
        txt = fetch_article_text(url)
        rows.append({"text": txt, "label": label})
    except Exception as e:
        print(f"ERROR fetching {url}: {e}")

df_scrape = pd.DataFrame(rows)
print("Scraped data distribution:\n", df_scrape.label.value_counts())

Scraped data distribution:
 label
center    9
right     6
left      6
Name: count, dtype: int64


In [51]:
# Cell 5 — load your CSV dataset, inspect & rename columns, then merge

# 1) load
df_csv = pd.read_csv("Political_Bias.csv")

# 2) inspect
print("Columns found in Political_Bias.csv:", df_csv.columns.tolist())

# 3) rename to our standard names
df_csv = df_csv.rename(columns={
    'Text': 'text',
    'Bias': 'label'
})

# 4) sanity check
assert all(col in df_csv.columns for col in ['text','label']), (
    "Failed to rename columns — "
    f"available: {df_csv.columns.tolist()}"
)

# 5) keep only what we need
df_csv = df_csv[['text','label']]
print("\nLoaded CSV data distribution:\n", df_csv.label.value_counts())

# 6) now combine with scraped data
df_all = pd.concat([df_scrape, df_csv], ignore_index=True)
print("\nCombined data distribution:\n", df_all.label.value_counts())


Columns found in Political_Bias.csv: ['Title', 'Link', 'Text', 'Source', 'Bias']

Loaded CSV data distribution:
 label
left          1865
lean left      534
right          529
center         319
lean right     211
Name: count, dtype: int64

Combined data distribution:
 label
left          1871
right          535
lean left      534
center         328
lean right     211
Name: count, dtype: int64


In [53]:
# Cell 6 — clean, then train/test split & vectorization

# 0) drop any rows with missing text or label
df_all = df_all.dropna(subset=['text', 'label']).reset_index(drop=True)

# 1) split
X_train, X_test, y_train, y_test = train_test_split(
    df_all["text"],
    df_all["label"],
    test_size=0.2,
    random_state=42,
    stratify=df_all["label"]
)

# 2) vectorize
vect = TfidfVectorizer(stop_words="english", max_df=0.8, min_df=2)
X_train_tfidf = vect.fit_transform(X_train)
X_test_tfidf  = vect.transform(X_test)


In [54]:
# Cell 7 — train classifier and print clear metrics
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)

y_pred = clf.predict(X_test_tfidf)
acc = accuracy_score(y_test, y_pred)

print("=== Classification Report ===")
print(f"Accuracy: {acc:.2f}\n")
print(classification_report(
    y_test,
    y_pred,
    labels=clf.classes_,
    zero_division=0
))

=== Classification Report ===
Accuracy: 0.74

              precision    recall  f1-score   support

      center       0.96      0.40      0.57        57
   lean left       0.89      0.38      0.54       104
  lean right       1.00      0.05      0.09        42
        left       0.69      1.00      0.81       374
       right       0.94      0.63      0.75       107

    accuracy                           0.74       684
   macro avg       0.90      0.49      0.55       684
weighted avg       0.80      0.74      0.70       684



In [68]:
# Cell 8 — clearer prediction helper
def classify_article(input_, return_proba=False):
    """
    Predicts 'left', 'center', or 'right' for a URL or raw text,
    then prints a formatted summary.
    """
    # fetch vs. raw
    if input_.startswith("http"):
        text = fetch_article_text(input_)
    else:
        text = input_

    tf = vect.transform([text])
    pred = clf.predict(tf)[0]
    probs = dict(zip(clf.classes_, clf.predict_proba(tf)[0]))

    print("=== Bias Prediction ===")
    print(f"Predicted leaning:  {pred.upper()}")
    print("Class probabilities:")
    for label, p in probs.items():
        print(f"  - {label.capitalize():6s}: {p:.2%}")

    if return_proba:
        return pred, probs
    return pred

# Example usage:
# Trump–Zelenskyy argument
classify_article("https://www.bbc.com/news/articles/clyj3j2r25wo")
classify_article("https://www.cnn.com/2025/02/28/politics/trump-zelensky-vance-oval-office/index.html")
classify_article("https://www.foxnews.com/politics/trump-vance-zelenskyy-spar-over-russian-war-tense-exchange-very-disrespectful")
classify_article("https://www.msnbc.com/top-stories/latest/zelenskyy-trump-vance-meeting-ukraine-shouting-match-rcna194282")

# DEI Removal
classify_article("https://www.bbc.com/news/articles/c24110m30ddo")
classify_article("https://www.cnn.com/2025/03/07/us/university-dei-housing-scholarships-college/index.html")
classify_article("https://www.foxnews.com/politics/trump-signs-executive-orders-banning-radical-gender-ideology-dei-initiatives-military")
classify_article("https://www.msnbc.com/top-stories/latest/trump-federal-diversity-equity-inclusion-employees-dei-leave-rcna188721")

# Department of Education
classify_article("https://www.bbc.com/news/articles/c79zxzj90nno")
classify_article("https://www.cnn.com/2025/03/22/us/department-of-education-resources-disabled-children/index.html")
classify_article("https://www.foxnews.com/politics/trump-sign-executive-order-abolish-department-education")
classify_article("https://www.msnbc.com/top-stories/latest/education-dept-impeach-judges-lawyers-trump-newsletter-rcna197077")

# Deportation of people
classify_article("https://www.bbc.com/news/articles/cp9yv1gnzyvo")
classify_article("https://www.cnn.com/2025/05/03/politics/what-happens-with-us-citizen-children-caught-up-in-trumps-deportation-push")
classify_article("https://www.foxnews.com/politics/trump-tells-illegal-immigrants-self-deport-using-cbp-home-app-new-video")
classify_article("https://www.msnbc.com/opinion/msnbc-opinion/judge-boasberg-trump-alien-enemies-act-argument-rcna198463")

# Gold cards
classify_article("https://www.bbc.com/news/articles/cq5zgvdz2z0o")
classify_article("https://www.cnn.com/2025/03/02/politics/gold-card-trump-congress-cec/index.html")
classify_article("https://www.foxnews.com/world/trump-touts-5-million-gold-card-new-path-citizenship")
classify_article("https://www.msnbc.com/top-stories/latest/gold-visas-trump-fox-news-interview-rcna197352")

# Impact of Elon Musk and Doge
classify_article("https://www.bbc.com/news/articles/c23vkd57471o")
classify_article("https://www.cnn.com/2025/03/05/politics/elon-musk-rogan-interview-empathy-doge/index.html")
classify_article("https://www.foxnews.com/politics/elon-musk-strategizes-trillion-spending-cuts-house-doge-panel-closed-door-meeting")
classify_article("https://www.msnbc.com/opinion/msnbc-opinion/elon-musk-doge-mass-firings-government-rcna193563")



=== Bias Prediction ===
Predicted leaning:  CENTER
Class probabilities:
  - Center: 29.39%
  - Lean left: 19.06%
  - Lean right: 6.52%
  - Left  : 24.64%
  - Right : 20.39%
=== Bias Prediction ===
Predicted leaning:  RIGHT
Class probabilities:
  - Center: 19.65%
  - Lean left: 22.73%
  - Lean right: 8.14%
  - Left  : 24.09%
  - Right : 25.39%
=== Bias Prediction ===
Predicted leaning:  RIGHT
Class probabilities:
  - Center: 9.14%
  - Lean left: 18.59%
  - Lean right: 6.86%
  - Left  : 16.15%
  - Right : 49.27%
=== Bias Prediction ===
Predicted leaning:  LEFT
Class probabilities:
  - Center: 10.42%
  - Lean left: 15.82%
  - Lean right: 6.40%
  - Left  : 42.04%
  - Right : 25.33%
=== Bias Prediction ===
Predicted leaning:  LEFT
Class probabilities:
  - Center: 22.98%
  - Lean left: 13.12%
  - Lean right: 6.94%
  - Left  : 38.54%
  - Right : 18.41%
=== Bias Prediction ===
Predicted leaning:  LEFT
Class probabilities:
  - Center: 21.66%
  - Lean left: 18.05%
  - Lean right: 6.69%
  - Left 

'left'