In [None]:
import pandas as pd
import numpy as np
import textwrap
from tqdm import tqdm
import torch

from sentence_transformers import SentenceTransformer

# Read the labelled comment corpus.
# Expected columns: ['City', 'Period_Label', 'Text']
df = pd.read_csv("immigration_comments_with_period_label_updated.csv")
print(df.head())


# Pick the GPU when torch reports one is available, otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Sentence-embedding model, placed on the device selected above.
model = SentenceTransformer(
    "Linq-AI-Research/Linq-Embed-Mistral",
    device=device,
)


### generate embeddings

def generate_ling_mistral_embeddings(texts, chunk_size=30000, batch_size=32):
    """Embed each text with the notebook-level ``model``; one vector per text.

    Every text is split with ``textwrap.wrap`` into chunks of at most
    ``chunk_size`` characters, the chunks are encoded in groups of
    ``batch_size`` with ``normalize_embeddings=True``, and the chunk vectors
    are averaged into a single vector for that text.

    Args:
        texts: Iterable of comment strings. Entries that are not ``str``
            (for example NaN from a CSV with missing values) are treated as
            the empty string.
        chunk_size: Maximum number of characters per chunk.
        batch_size: Number of chunks passed to ``model.encode`` per call.

    Returns:
        ``np.ndarray`` with exactly one row per input text, in input order.
        Rows are means of unit-length chunk vectors, so a row built from
        several chunks is generally not unit-length itself.
    """
    final_embeddings = []
    for text in tqdm(texts):
        # textwrap.wrap raises on non-strings and returns [] for empty or
        # whitespace-only text. An empty chunk list would make np.mean return
        # a scalar NaN and break the one-row-per-text matrix, so fall back to
        # encoding a single empty string to keep rows aligned with the input.
        if not isinstance(text, str):
            text = ""
        chunks = textwrap.wrap(text, width=chunk_size) or [""]

        chunk_embeddings = []
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i+batch_size]
            batch_emb = model.encode(
                batch, convert_to_numpy=True, normalize_embeddings=True, device=device
            )
            chunk_embeddings.extend(batch_emb)

        # Collapse the per-chunk vectors into one vector for this text.
        avg_embedding = np.mean(chunk_embeddings, axis=0)
        final_embeddings.append(avg_embedding)
    return np.array(final_embeddings)

# Embed the whole 'Text' column in row order, then write the resulting matrix
# (one row per comment) to CSV without an index column.
embeddings = generate_ling_mistral_embeddings(texts=df.loc[:, 'Text'].tolist())
embeddings_df = pd.DataFrame(data=embeddings)
embeddings_df.to_csv(path_or_buf="ling_mistral_embeddings.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
import umap
import hdbscan
from sklearn.feature_extraction.text import TfidfVectorizer

# load comments (the same CSV, in the same row order, the embeddings were built from)
df = pd.read_csv("immigration_comments_with_period_label_updated.csv")

# load embeddings: row i is the embedding of row i of the comments CSV
embeddings = pd.read_csv("ling_mistral_embeddings.csv").values
if len(embeddings) != len(df):
    raise ValueError(
        f"Embedding rows ({len(embeddings)}) do not match comment rows ({len(df)}); "
        "regenerate ling_mistral_embeddings.csv from the current comments file."
    )

# keep non-outlier comments whose Username is not '[deleted]'
# NOTE(review): the previous comment here said "don't exclude deleted", but the
# filter does drop '[deleted]' users -- confirm which behaviour is intended.
keep_mask = ((df['Outlier'] == False) & (df['Username'] != '[deleted]')).values

# Apply the SAME mask to the embeddings before the index is reset. Indexing
# the embeddings with df.index after reset_index(drop=True) would pick the
# first n rows (0..n-1) instead of the rows of the comments that were kept,
# silently pairing every comment with the wrong embedding.
embeddings = embeddings[keep_mask]
df = df[keep_mask].reset_index(drop=True)

# Reduce the embeddings with UMAP (cosine metric, fixed seed)
reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
embedding_umap = reducer.fit_transform(embeddings)


# HDBSCAN clustering on the reduced coordinates; the resulting label of each
# comment is stored on the frame as 'Topic_Label'
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=5,
    metric='euclidean',
)
labels = clusterer.fit_predict(embedding_umap)
df['Topic_Label'] = labels

# represent topics
# Missing comment text (NaN) makes TfidfVectorizer raise and would also break
# the " | ".join below, so work on a NaN-free string copy. df['Text'] itself
# is left untouched.
clean_texts = df['Text'].fillna("").astype(str)

vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
X = vectorizer.fit_transform(clean_texts)
feature_names = vectorizer.get_feature_names_out()

topics_summary = []

# Iterate labels in ascending order (plain set iteration order is arbitrary)
# so the summary rows come out in a stable, readable order.
for topic in sorted(set(labels)):
    # -1 is the label HDBSCAN gives to points it does not assign to a cluster
    if topic == -1:
        continue
    topic_indices = np.where(labels == topic)[0]
    topic_comments = clean_texts.iloc[topic_indices].tolist()

    # Ten terms with the highest mean TF-IDF weight over the topic's comments
    topic_tfidf = X[topic_indices].mean(axis=0)
    top_word_indices = np.array(topic_tfidf).flatten().argsort()[-10:][::-1]
    common_words = [feature_names[i] for i in top_word_indices]

    # Five comments closest (Euclidean, in UMAP space) to the topic centroid
    topic_embeddings = embedding_umap[topic_indices]
    centroid = np.mean(topic_embeddings, axis=0)
    distances = np.linalg.norm(topic_embeddings - centroid, axis=1)
    rep_comment_indices = distances.argsort()[:5]
    representative_comments = [topic_comments[i] for i in rep_comment_indices]

    topics_summary.append({
        "Topic_Label": topic,
        "Common_Words": ", ".join(common_words),
        "Representative_Comments": " | ".join(representative_comments)
    })

# Tag every comment with its "<City>_<Period_Label>" group and collect the
# distinct groups; each one becomes a "Prop_<group>" entry per topic.
df['City_Period'] = df['City'] + '_' + df['Period_Label']
all_city_periods = df['City_Period'].unique()

# Stance value -> summary key, in the order the keys are added.
for summary in topics_summary:
    members = df.loc[df['Topic_Label'] == summary['Topic_Label']]

    # Share of the topic's comments carrying each stance value (1 / 0 / -1)
    stance_share = members['Stance'].value_counts(normalize=True).to_dict()
    for key, stance_value in (
        ('Prop_Positive', 1),
        ('Prop_Neutral', 0),
        ('Prop_Negative', -1),
    ):
        summary[key] = stance_share.get(stance_value, 0.0)

    # Share of the topic's comments from each city/period group; groups that
    # do not occur in the topic get 0.0
    group_share = members['City_Period'].value_counts(normalize=True).to_dict()
    summary.update(
        {f"Prop_{cp}": group_share.get(cp, 0.0) for cp in all_city_periods}
    )

# Write the labelled comments and the enriched per-topic table to disk
df.to_csv("immigration_comments_with_topics.csv", index=False)
pd.DataFrame(topics_summary).to_csv(
    "topic_summaries_enriched.csv",
    index=False,
)

In [11]:
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest

# Read the topic-labelled comments and keep only the rows whose "2025" and
# "Outlier" flags are both False
df = pd.read_csv("immigration_comments_with_topics.csv")
df = df.loc[df["2025"].eq(False) & df["Outlier"].eq(False)].copy()

# Normalise the text: missing -> "", lower-case, strip surrounding whitespace
df["Text"] = (
    df["Text"]
    .fillna("")
    .str.lower()
    .str.strip()
)

# Topic labels and cities that the comparison below is run for
topics = [20, 28, 122, 124, 228]
cities = ["Chicago", "Denver", "New York City"]

# function
def compare_topic_shift_by_city(city_df, topic_ids=None):
    """Compare topic prevalence between the Control and Experimental periods of one city.

    For every topic, the share of the city's comments carrying that
    ``Topic_Label`` is computed separately for rows whose ``Period_Label`` is
    "Control" and "Experimental", and the two shares are compared with a
    two-proportion z-test.

    Args:
        city_df: Comments of a single city; must contain the columns
            ``City``, ``Period_Label`` and ``Topic_Label``.
        topic_ids: Topic labels to evaluate. Defaults to the notebook-level
            ``topics`` list when omitted.

    Returns:
        DataFrame with one row per topic and the columns City, Topic,
        Control %, Experimental %, Raw Change (percentage points),
        Percent Change (relative, ``inf`` when the topic is absent from the
        control period but present in the experimental one), p-value and
        Significant (p < 0.05). An empty ``city_df`` yields an empty frame
        with those columns. The p-value is NaN when the test is undefined.
    """
    if topic_ids is None:
        topic_ids = topics

    columns = [
        "City", "Topic", "Control %", "Experimental %",
        "Raw Change", "Percent Change", "p-value", "Significant",
    ]

    # An empty slice has no city name to read (iloc[0] would raise IndexError).
    if city_df.empty:
        return pd.DataFrame(columns=columns)

    city_name = city_df["City"].iloc[0]

    # Period masks and totals do not depend on the topic: compute them once.
    is_control = city_df["Period_Label"] == "Control"
    is_experimental = city_df["Period_Label"] == "Experimental"
    c_total = int(is_control.sum())
    e_total = int(is_experimental.sum())

    results = []
    for topic in topic_ids:
        in_topic = city_df["Topic_Label"] == topic
        c = int((is_control & in_topic).sum())
        e = int((is_experimental & in_topic).sum())

        control_pct = (c / c_total) * 100 if c_total else 0
        experimental_pct = (e / e_total) * 100 if e_total else 0
        raw_change = experimental_pct - control_pct

        if control_pct == 0:
            percent_change = float('inf') if experimental_pct > 0 else 0.0
        else:
            percent_change = ((experimental_pct - control_pct) / control_pct) * 100

        # The z-test is undefined when either period has no comments or when
        # the pooled proportion is exactly 0 or 1 (zero variance). Report NaN
        # directly instead of letting the test divide by zero.
        pooled = e + c
        if c_total == 0 or e_total == 0 or pooled == 0 or pooled == c_total + e_total:
            pval = float("nan")
        else:
            _, pval = proportions_ztest([e, c], [e_total, c_total])

        results.append({
            "City": city_name,
            "Topic": topic,
            "Control %": round(control_pct, 1),
            "Experimental %": round(experimental_pct, 1),
            "Raw Change": round(raw_change, 1),
            "Percent Change": round(percent_change, 1),
            "p-value": round(pval, 4),
            # NaN compares False, so an undefined test is never "significant"
            "Significant": bool(pval < 0.05)
        })
    return pd.DataFrame(results, columns=columns)

# Run the comparison once per city (in `cities` order), stack the per-city
# tables, and write the combined table to CSV without the index
stat_results = pd.concat(
    [
        compare_topic_shift_by_city(df.loc[df["City"].eq(city)])
        for city in cities
    ]
)
stat_results.to_csv(
    "topic_significance_by_city_percent.csv",
    index=False,
)

In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Use a serif typeface for all figure text
plt.rcParams["font.family"] = "serif"

# Read the per-city topic significance table
df = pd.read_csv("topic_significance_by_city_percent.csv")

# Topics and cities shown in the figure; these lists also fix the column and
# row order of the pivot tables below
topics = [20, 28, 122, 124, 228]
cities = ["Chicago", "Denver", "New York City"]

# Keep only the rows for those topics and cities
df = df.loc[df["City"].isin(cities) & df["Topic"].isin(topics)].copy()

# City x Topic grids of the raw change and of the p-value
pivot_change = (
    df.pivot(index="City", columns="Topic", values="Raw Change")
    .reindex(index=cities, columns=topics)
)
pivot_pval = (
    df.pivot(index="City", columns="Topic", values="p-value")
    .reindex(index=cities, columns=topics)
)

# Normalize p-values for the colormap: 0 for p >= 0.05, rising linearly to 1
# as p -> 0 (smaller p = stronger color). Missing p-values (an undefined test
# or an absent city/topic pair) are treated as p = 1, i.e. not significant,
# so the cell is still colored and its label stays legible.
norm_pvals = 1 - pivot_pval.fillna(1.0).clip(upper=0.05) / 0.05
cmap = plt.cm.coolwarm

# Set up figure
fig, ax = plt.subplots(figsize=(10, 6))
# Give imshow the scalar grid together with cmap/vmin/vmax instead of
# pre-computed RGBA colors. With RGBA input the image keeps its default
# colormap and norm, so the colorbar built from it would not match the cells.
im = ax.imshow(
    norm_pvals.values.astype(float),
    cmap=cmap,
    vmin=0,
    vmax=1,
    aspect='auto',
)

# Add annotations (raw change in percentage points; "n/a" when missing)
for i in range(len(pivot_change.index)):
    for j in range(len(pivot_change.columns)):
        val = pivot_change.iloc[i, j]
        label = "n/a" if pd.isna(val) else f"{val:.1f}%"
        ax.text(j, i, label, ha='center', va='center',
                fontsize=11,
                color='black' if norm_pvals.iloc[i, j] < 0.4 else 'white')

# Format axis labels
ax.set_xticks(np.arange(len(topics)))
ax.set_xticklabels([f"Topic {t}" for t in topics], fontsize=11)

ax.set_yticks(np.arange(len(cities)))
ax.set_yticklabels([c for c in cities], fontsize=11)

# Add gridlines for better readability (minor ticks sit on the cell borders)
ax.set_xticks(np.arange(len(topics)+1)-.5, minor=True)
ax.set_yticks(np.arange(len(cities)+1)-.5, minor=True)
ax.grid(which="minor", color='white', linestyle='-', linewidth=2)
ax.tick_params(which="minor", bottom=False, left=False)

# Labels and title
ax.set_title("Change in Topic Prevalence (Experimental − Control)", fontsize=14, pad=15)
ax.set_xlabel("Unsupervised Topic", fontsize=12)
ax.set_ylabel("City", fontsize=12)

# Add matching colorbar. The label states the quantity actually plotted
# (p is clipped at 0.05 and rescaled), which is not plain 1 − p.
cbar = plt.colorbar(im, ax=ax, label="Significance (1 − p / 0.05, p capped at 0.05)",
                    shrink=0.85, pad=0.02)
cbar.ax.tick_params(labelsize=10)

# Finalize: act on this figure explicitly rather than on pyplot's current one
fig.tight_layout()
fig.savefig("topic_change_heatmap_readable.png")
plt.close(fig)