# Political Bias EDA & Tokenization Pipeline

This notebook performs end-to-end exploratory data analysis (EDA) and tokenization for your TruthSocial/Bluesky datasets.

**What it does:**
1. Discover and load merged datasets (global and per topic)
2. Clean HTML → plain text
3. Tokenize with `nltk.tokenize`
4. Global EDA: length distribution, top words, platform/topic counts
5. Sentiment (VADER)
6. Semantic embeddings + UMAP visualization
7. TF–IDF keyword comparison by platform
8. Topic-level EDA (loop over each topic file)

> Tip: Run **Setup** first to install required packages and download NLTK corpora.

In [2]:
import sys, transformers, sentence_transformers, umap
from sentence_transformers import SentenceTransformer
print("python:", sys.executable)
print("transformers:", transformers.__version__)
print("sentence-transformers:", sentence_transformers.__version__)
print("umap:", umap.__version__)

python: /Users/yukkihsu/opt/anaconda3/bin/python
transformers: 4.44.2
sentence-transformers: 3.0.1
umap: 0.5.9.post2


In [3]:
#!pip install nltk beautifulsoup4 ftfy emoji matplotlib wordcloud scikit-learn umap-learn sentence-transformers datasets transformers

import os, json, glob, re
from pathlib import Path

import nltk
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

from bs4 import BeautifulSoup
import ftfy
import emoji

import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer

try:
    from sentence_transformers import SentenceTransformer
    import umap
    _HAS_EMB = True
except Exception:
    _HAS_EMB = False

print('Setup complete. _HAS_EMB =', _HAS_EMB)

Setup complete. _HAS_EMB = True


In [5]:
# === Configuration ===
BASE_DIR = Path('.')  # change if needed
DATASET_DIR = BASE_DIR / 'dataset'
MERGED_GLOBAL = DATASET_DIR / 'merged_all_posts.json'
MERGED_BY_TOPIC_DIR = DATASET_DIR / 'merged_by_topic'
FIG_DIR = BASE_DIR / 'figs'
FIG_DIR.mkdir(parents=True, exist_ok=True)

STOPWORDS = set(stopwords.words('english'))
URL_RE = re.compile(r'https?://\S+')
MENTION_RE = re.compile(r'@\w[\w\.]*')
HASHTAG_RE = re.compile(r'#\w+')

def canonical_id(p):
    return p.get('id') or p.get('uri') or p.get('url')

def clean_text(html_or_text: str) -> str:
    text = BeautifulSoup(html_or_text or '', 'html.parser').get_text(' ')
    text = ftfy.fix_text(text)
    text = URL_RE.sub(' <URL> ', text)
    text = MENTION_RE.sub(' <USER> ', text)
    text = HASHTAG_RE.sub(' <HASHTAG> ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def tokenize_text(text: str):
    toks = [w.lower() for w in word_tokenize(text) if w.isalpha()]
    toks = [w for w in toks if w not in STOPWORDS]
    return toks

def load_topic_frames():
    # optional: brings in your {topic: {'left':[], 'right':[], 'neutral':[]}}
    tf_py = BASE_DIR / 'topic_frames.py'
    tf_json = BASE_DIR / 'topic_frames.json'
    frames = {}
    if tf_py.exists():
        import importlib.util
        spec = importlib.util.spec_from_file_location('topic_frames', str(tf_py))
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
        frames = getattr(mod, 'TOPIC_FRAMES', {})
    elif tf_json.exists():
        frames = json.loads(tf_json.read_text('utf-8'))
    return frames

TOPIC_FRAMES = load_topic_frames()
print('Loaded TOPIC_FRAMES keys:', list(TOPIC_FRAMES.keys())[:3], '...')

Loaded TOPIC_FRAMES keys: ['Abortion and Reproductive Policy', 'Gun Policy and Firearms Regulation', 'Climate and Environmental Policy'] ...


In [6]:
# === Load merged global dataset (if present) ===
rows = []
if MERGED_GLOBAL.exists():
    data = json.loads(MERGED_GLOBAL.read_text('utf-8'))
    seen = set()
    for p in data:
        pid = canonical_id(p)
        if not pid or pid in seen:
            continue
        seen.add(pid)
        meta = p.get('__meta__', {}) or {}
        txt = clean_text(p.get('content') or p.get('text') or '')
        if not txt:
            continue
        rows.append({
            'id': pid,
            'text': txt,
            'topic': meta.get('topic'),
            'keyword': meta.get('matched_keyword'),
            'platform': meta.get('platform', 'unknown')
        })
print(f'Loaded global rows: {len(rows)}')

Loaded global rows: 38195


In [8]:
# === Global EDA ===
import numpy as np

lengths = [len(r['text'].split()) for r in rows]
print('Total posts:', len(rows))
if lengths:
    print('Avg length:', round(np.mean(lengths), 2), 'Median:', int(np.median(lengths)))

# Plot length distribution
plt.figure()
plt.hist(lengths, bins=50)
plt.title('Text Length Distribution (word count) — Global')
plt.xlabel('Words per post')
plt.ylabel('Frequency')
plt.savefig(FIG_DIR / 'global_length_distribution.png', dpi=150, bbox_inches='tight')
plt.close()

# Platform and topic counts
from collections import Counter
plat_counts = Counter(r['platform'] for r in rows if r.get('platform'))
top_counts = Counter(r['topic'] for r in rows if r.get('topic'))
print('Platforms:', plat_counts)
print('Top topics:', top_counts.most_common(10))

# Bar plots
plt.figure()
plt.bar(list(plat_counts.keys()), list(plat_counts.values()))
plt.title('Posts per Platform — Global')
plt.savefig(FIG_DIR / 'global_posts_per_platform.png', dpi=150, bbox_inches='tight')
plt.close()

if top_counts:
    labels, vals = zip(*top_counts.most_common(20))
    plt.figure(figsize=(8,5))
    y_pos = range(len(labels))
    plt.barh(y_pos, vals)
    plt.yticks(y_pos, labels)
    plt.title('Top 20 Topics — Global')
    plt.savefig(FIG_DIR / 'global_top_topics.png', dpi=150, bbox_inches='tight')
    plt.close()

# Tokenization + top words
all_tokens = []
for r in rows:
    all_tokens.extend(tokenize_text(r['text']))
freq = Counter(all_tokens)
print('Most common words:', freq.most_common(20))

# Top words bar chart
if freq:
    words, counts = zip(*freq.most_common(25))
    plt.figure(figsize=(8,5))
    y = range(len(words))
    plt.barh(y, counts)
    plt.yticks(y, words)
    plt.title('Top 25 Words — Global')
    plt.savefig(FIG_DIR / 'global_top_words.png', dpi=150, bbox_inches='tight')
    plt.close()

print('Saved global figures to', FIG_DIR)

Total posts: 38195
Avg length: 130.22 Median: 68
Platforms: Counter({'truthsocial': 38195})
Top topics: [('Abortion and Reproductive Policy', 2000), ('Climate and Environmental Policy', 1994), ('Basic Income and Welfare Programs', 1985), ('Foreign Policy and National Defense', 1985), ('Drug Policy and Substance Regulation', 1980), ('Free Speech and Content Regulation', 1979), ('Gun Policy and Firearms Regulation', 1973), ('Immigration and Border Policy', 1953), ('Policing and Criminal Justice Reform', 1951), ('Voting and Election Policy', 1946)]
Most common words: [('trump', 16180), ('people', 12970), ('https', 9883), ('government', 9711), ('like', 9254), ('president', 8916), ('america', 8227), ('one', 7703), ('law', 7400), ('us', 7364), ('federal', 7067), ('would', 7050), ('state', 6929), ('american', 6629), ('states', 6361), ('new', 6306), ('url', 6220), ('democrats', 6174), ('public', 5971), ('rights', 5792)]
Saved global figures to figs


In [9]:
# === Sentiment (VADER) ===
sia = SentimentIntensityAnalyzer()
sent = [sia.polarity_scores(r['text'])['compound'] for r in rows]
plt.figure()
plt.hist(sent, bins=50)
plt.title('Sentiment (compound) — Global')
plt.savefig(FIG_DIR / 'global_sentiment_hist.png', dpi=150, bbox_inches='tight')
plt.close()

# By platform boxplot-like summary (manual)
by_platform = defaultdict(list)
for r, s in zip(rows, sent):
    by_platform[r['platform']].append(s)

labels = list(by_platform.keys())
plt.figure()
data = [by_platform[k] for k in labels]
plt.boxplot(data, labels=labels, showmeans=True)
plt.title('Sentiment by Platform — Global')
plt.savefig(FIG_DIR / 'global_sentiment_by_platform.png', dpi=150, bbox_inches='tight')
plt.close()


In [7]:
# === UMAP Embeddings (optional: requires sentence-transformers & umap) ===
if _HAS_EMB and rows:
    texts = [r['text'] for r in rows]
    platforms = [r['platform'] for r in rows]
    model = SentenceTransformer('all-MiniLM-L6-v2')
    import numpy as np
    n = len(texts)
    if n > 8000:
        idx = np.random.choice(n, 8000, replace=False)
        texts = [texts[i] for i in idx]
        platforms = [platforms[i] for i in idx]
    emb = model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
    xy = reducer.fit_transform(emb)
    # Scatter by platform
    plt.figure(figsize=(6,5))
    labs = sorted(set(platforms))
    for lab in labs:
        idx = [i for i, p in enumerate(platforms) if p == lab]
        plt.scatter(xy[idx,0], xy[idx,1], s=4, label=lab, alpha=0.7)
    plt.title('UMAP — colored by platform (global)')
    plt.legend(markerscale=3)
    plt.savefig(FIG_DIR / 'global_umap_platform.png', dpi=150, bbox_inches='tight')
    plt.close()
else:
    print('UMAP skipped: sentence-transformers/umap not available.')

Batches: 100%|██████████| 250/250 [03:27<00:00,  1.21it/s]
  warn(
OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [8]:
# === TF-IDF top terms by platform ===
platform_buckets = defaultdict(list)
for r in rows:
    platform_buckets[r['platform']].append(r['text'])

for name, texts in platform_buckets.items():
    if len(texts) < 50:
        continue
    vec = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=5, max_df=0.5,
                          token_pattern=r"[A-Za-z<>#@][A-Za-z0-9_<>\-']+")
    X = vec.fit_transform(texts)
    scores = X.mean(axis=0).A1
    terms = vec.get_feature_names_out()
    top = sorted(zip(terms, scores), key=lambda x: x[1], reverse=True)[:20]
    # Plot
    terms_plot = [t for t, _ in top][::-1]
    vals_plot = [s for _, s in top][::-1]
    plt.figure(figsize=(7,5))
    y = range(len(terms_plot))
    plt.barh(y, vals_plot)
    plt.yticks(y, terms_plot)
    plt.title(f'Top TF-IDF terms — {name}')
    plt.savefig(FIG_DIR / f'tfidf_{name}.png', dpi=150, bbox_inches='tight')
    plt.close()


In [9]:
# === Topic-level EDA (iterate merged_by_topic/*.json if present) ===
topic_files = sorted(glob.glob(str(MERGED_BY_TOPIC_DIR / '*_posts.json')))
print('Found topic files:', len(topic_files))

for path in topic_files:
    topic_slug = Path(path).name.replace('_posts.json','')
    data = json.loads(Path(path).read_text('utf-8'))
    topic_rows = []
    seen_ids = set()
    for p in data:
        pid = canonical_id(p)
        if not pid or pid in seen_ids:
            continue
        seen_ids.add(pid)
        meta = p.get('__meta__', {}) or {}
        txt = clean_text(p.get('content') or p.get('text') or '')
        if not txt:
            continue
        topic_rows.append({'id': pid, 'text': txt, 'platform': meta.get('platform','unknown')})
    if not topic_rows:
        continue

    # length dist
    lengths = [len(r['text'].split()) for r in topic_rows]
    plt.figure()
    plt.hist(lengths, bins=50)
    plt.title(f'Text Length — {topic_slug}')
    plt.savefig(FIG_DIR / f'{topic_slug}_length.png', dpi=150, bbox_inches='tight')
    plt.close()

    # tokens top words
    tokens = []
    for r in topic_rows:
        tokens.extend(tokenize_text(r['text']))
    freq = Counter(tokens)
    if freq:
        words, counts = zip(*freq.most_common(20))
        plt.figure(figsize=(7,5))
        y = range(len(words))
        plt.barh(y, counts)
        plt.yticks(y, words)
        plt.title(f'Top 20 words — {topic_slug}')
        plt.savefig(FIG_DIR / f'{topic_slug}_top_words.png', dpi=150, bbox_inches='tight')
        plt.close()

print('Topic-level figures saved to', FIG_DIR)

Found topic files: 20
Topic-level figures saved to figs
