# Latent Dirichlet Allocation

This notebook follows Mirko Draca and Carlo Schwarz's selection of World Values Survey (WVS) questions and waves.

In [None]:
pip install -r requirements.txt

In [19]:
import os
import pandas as pd

# Raw WVS export, read relative to the notebook's working directory.
# The file name suggests it holds the time series for waves 1-7.
wvs_csv_path = '../data/raw/wvs_ts_w1_w7.csv'
df_wvs = pd.read_csv(wvs_csv_path)

In [20]:
import pandas as pd

def recode_survey_responses(df, question_columns, neutral_values={3, 5}):
    """
    Recode World Values Survey answers into support/oppose indicator columns.

    Intended to follow the Draca & Schwarz (2024) methodology: negative codes
    are treated as missing and replaced by the mean of the non-negative
    answers in the same column, then every question is split into a
    ``<question>_support`` and a ``<question>_oppose`` column.

    Recoding rules (applied after imputation, so imputed means go through the
    same thresholds as real answers):
    - ``C002`` (1-3 scale): 1 -> support, 2 -> oppose, anything else -> neither.
    - ``G006`` (1-4 scale): 1-2 -> support, 3-4 -> oppose.
    - ``E036``, ``E037``, ``E039`` and any column whose name contains ``F1``
      (1-10 scale): >= 6 -> support, <= 4 -> oppose, in between -> neither.
    - Every other column is assumed to be coded 0/1: support is the value
      itself and oppose is ``1 - value``; imputed rows therefore carry the
      column mean instead of a 0/1 flag.

    Parameters:
    - df (pd.DataFrame): Survey DataFrame holding the question columns plus
      ``COUNTRY_ALPHA``/``S020`` (renamed to ``country``/``year``). It is not
      modified.
    - question_columns (list): List of columns to transform.
    - neutral_values (set): Values representing neutrality. Currently unused
      by the recoding rules above; kept so existing calls keep working.

    Returns:
    - pd.DataFrame: ``country``, ``year``, then all ``*_support`` columns and
      all ``*_oppose`` columns, each group in the order of ``question_columns``.

    Raises:
    - KeyError: if a requested question column (or country/year) is missing.
    """
    # Work on the frame passed by the caller; the previous implementation read
    # the notebook-level `df_wvs` and silently ignored `df`.
    # `rename` returns a new frame, so the caller's data is left untouched.
    renamed = df.rename(columns={"COUNTRY_ALPHA": "country", "S020": "year"})

    missing = [col for col in question_columns if col not in renamed.columns]
    if missing:
        raise KeyError(f"Question columns not found in df: {missing}")

    ten_point_columns = {"E036", "E037", "E039"}
    support_columns = {}
    oppose_columns = {}

    for col in question_columns:
        responses = renamed[col]

        # Negative codes mark missing answers: replace them with the mean of
        # the valid (non-negative) answers of this column.
        mean_value = responses[responses >= 0].mean()
        responses = responses.mask(responses < 0, mean_value)

        if col == "C002":  # 1-3 scale: 1 = agree (support), 2 = disagree (oppose)
            support = (responses == 1).astype(int)
            oppose = (responses == 2).astype(int)
        elif col == "G006":  # 1-4 scale: 1-2 = support, 3-4 = oppose
            support = responses.isin([1, 2]).astype(int)
            oppose = responses.isin([3, 4]).astype(int)
        elif col in ten_point_columns or "F1" in col:  # 1-10 scale
            support = (responses >= 6).astype(int)  # 6-10 = support
            oppose = (responses <= 4).astype(int)   # 1-4 = oppose
        else:  # treated as binary 0/1
            support = responses
            oppose = 1 - responses

        support_columns[f"{col}_support"] = support
        oppose_columns[f"{col}_oppose"] = oppose

    # Keep only the identifiers plus the new indicators (supports first, then opposes).
    return renamed[["country", "year"]].assign(**support_columns, **oppose_columns)


In [21]:
# Build the recoded frame (`df_encoded`) used by the rest of the notebook.
# The list is laid out by the branch of recode_survey_responses each code hits.
question_columns = [
    # handled by the binary (0/1) branch
    "A124_02", "A124_06", "A124_07", "A124_08", "A124_09",
    # handled by the 1-3 scale branch
    "C002",
    # handled by the 1-10 scale branch
    "E036", "E037", "E039",
    "F114A", "F115", "F116", "F117", "F118",
    "F119", "F120", "F121", "F122", "F123",
]

df_encoded = recode_survey_responses(df=df_wvs, question_columns=question_columns)

In [22]:
## Exploring transformed file
# Preview the first rows. A bare last expression is rendered with the notebook's
# rich table display instead of the wrapped plain-text dump that print() produces.
df_encoded.head()

  country  year  A124_02_support  A124_06_support  A124_07_support  \
0     ALB  1998              0.0              0.0              1.0   
1     ALB  1998              0.0              0.0              1.0   
2     ALB  1998              0.0              0.0              1.0   
3     ALB  1998              0.0              0.0              1.0   
4     ALB  1998              0.0              0.0              1.0   

   A124_08_support  A124_09_support  C002_support  E036_support  E037_support  \
0              1.0              1.0             1             0             1   
1              1.0              1.0             1             0             0   
2              1.0              1.0             1             0             1   
3              1.0              1.0             1             0             1   
4              1.0              1.0             1             0             1   

   ...  F114A_oppose  F115_oppose  F116_oppose  F117_oppose  F118_oppose  \
0  ...          

In [23]:
### Split the recoded frame into the four waves, using the survey year

# (first_year, last_year) per wave; `between` is inclusive on both ends.
# Years outside these ranges (e.g. before 1999, or 2015-2016) land in no wave.
wave_year_ranges = {
    4: (1999, 2004),
    5: (2005, 2009),
    6: (2010, 2014),
    7: (2017, 2022),
}

# One independent copy per wave so later edits never touch df_encoded.
wave_frames = {
    wave: df_encoded[df_encoded['year'].between(first_year, last_year)].copy()
    for wave, (first_year, last_year) in wave_year_ranges.items()
}

df_wave4 = wave_frames[4]
df_wave5 = wave_frames[5]
df_wave6 = wave_frames[6]
df_wave7 = wave_frames[7]

In [None]:
# Recode the raw survey responses into a second DataFrame.
# NOTE(review): this is the same call, with the same inputs, that builds
# `df_encoded`, and nothing is grouped despite the name - confirm whether an
# aggregation step (e.g. shares per country/year) is missing here.
grouped_df = recode_survey_responses(df=df_wvs, question_columns=question_columns)

# LDA Application

In [10]:
import json
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
import seaborn as sns
import matplotlib.pyplot as plt

# Load the variable dictionary from the JSON file in the working directory
with open("variable_dict.json", "r", encoding="utf-8") as file:
    variable_dict = json.load(file)

# df_encoded is the recoded survey DataFrame that LDA will be fitted on.
# Preview it with a bare last expression so the notebook renders a rich table
# instead of the wrapped plain-text output of print().
df_encoded.head()

  country  year  A124_02_support  A124_06_support  A124_07_support  \
0     ALB  1998              0.0              0.0              1.0   
1     ALB  1998              0.0              0.0              1.0   
2     ALB  1998              0.0              0.0              1.0   
3     ALB  1998              0.0              0.0              1.0   
4     ALB  1998              0.0              0.0              1.0   

   A124_08_support  A124_09_support  C002_support  E036_support  E037_support  \
0              1.0              1.0             1             0             1   
1              1.0              1.0             1             0             0   
2              1.0              1.0             1             0             1   
3              1.0              1.0             1             0             1   
4              1.0              1.0             1             0             1   

   ...  F114A_oppose  F115_oppose  F116_oppose  F117_oppose  F118_oppose  \
0  ...          

In [None]:
### This version ran successfully

# Feature matrix for LDA: every support/oppose column, identifiers removed for now
lda_data = df_encoded.drop(columns=["country", "year"])

# Number of latent ideological groups (LDA components)
num_topics = 10

lda_settings = dict(
    n_components=num_topics,
    doc_topic_prior=0.25,        # prior of the document-topic distribution
    topic_word_prior=0.1,        # prior of the topic-word distribution
    learning_method='online',    # mini-batch updates instead of full-batch
    learning_decay=0.7,          # controls how quickly older updates are forgotten
    learning_offset=10.0,        # down-weights the early learning steps
    max_iter=50,                 # maximum passes over the data (library default is 10)
    batch_size=1000,             # rows per mini-batch
    evaluate_every=-1,           # perplexity evaluation switched off
    mean_change_tol=0.001,       # stopping tolerance for the E-step updates
    max_doc_update_iter=300,     # maximum E-step iterations
    n_jobs=-1,                   # use every available CPU
    random_state=25,             # fixed seed (an earlier run used 42)
)
lda_model = LatentDirichletAllocation(**lda_settings)
lda_matrix = lda_model.fit_transform(lda_data)

# Topic-by-feature weights, one row per topic
feature_names = lda_data.columns
topic_words = pd.DataFrame(lda_model.components_, columns=feature_names)

# Scale each topic's weights to sum to 1, then put features on the rows
# (one column per ideology) for easier reading
topic_totals = topic_words.sum(axis=1)
topic_words = topic_words.div(topic_totals, axis=0).T
topic_words.columns = [f"Ideology_{i+1}" for i in range(num_topics)]

#### This version ran successfully

In [None]:
# Top 5 issues (largest normalized weight) for each ideology, one column per ideology.
# Built explicitly instead of relying on DataFrame.apply expanding returned lists.
top_issues = pd.DataFrame({
    ideology: weights.nlargest(5).index.tolist()
    for ideology, weights in topic_words.items()
})

# Save the table of top issues to a CSV file
top_issues.to_csv('top_issues.csv', index=False)

# Display last: a notebook cell only renders its final expression, so the
# table has to come after the save to actually be shown.
top_issues

Unnamed: 0,Ideology_1,Ideology_2,Ideology_3,Ideology_4,Ideology_5
0,A124_07_support,F119_oppose,F121_support,F115_support,A124_06_support
1,A124_09_support,F123_oppose,A124_02_oppose,F114A_support,A124_02_support
2,A124_08_support,F120_oppose,F117_oppose,F116_support,F119_oppose
3,F118_oppose,A124_02_oppose,A124_06_oppose,F121_support,F123_oppose
4,F119_oppose,A124_06_oppose,A124_09_oppose,F120_support,F117_oppose


In [29]:
### Folders for the results

# Create the report subfolders when they are missing (no error if they already exist)
for report_dir in ("../reports/lda_evaluation", "../reports/top_issues"):
    os.makedirs(report_dir, exist_ok=True)

# Example of saving evaluation results into the first folder:
#eval_df.to_csv(f"../reports/lda_evaluation/wave{wave_number}_evaluation.csv", index=False)

# Example of saving top issues into the second folder:
#top_issues_df.to_csv(f"../reports/top_issues/wave{wave_number}_top_issues.csv", index=False)

In [37]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import os
from itertools import combinations
from joblib import Parallel, delayed

# Cross-validation setup
num_folds = 10                                    # number of KFold splits
min_topics, max_topics = 1, 10
topic_range = range(min_topics, max_topics + 1)   # K = 1 to 10 (inclusive)

# Locations: evaluation tables are written under results_folder
data_folder = "data"
results_folder = "../reports/model_evaluation"
os.makedirs(name=results_folder, exist_ok=True)

def compute_npmi(issue_list, test_data):
    """Compute the mean NPMI over all pairs of top issues using test data.

    For every pair (i, j) of columns in ``issue_list`` the probabilities
    p_i, p_j and p_ij are estimated as shares of rows in ``test_data`` and
    NPMI = log(p_ij / (p_i * p_j)) / -log(p_ij) is computed, with a small
    epsilon added for numerical stability.

    Parameters:
    - issue_list (list): Column names of the issues to score.
    - test_data (pd.DataFrame): Held-out rows containing those columns. Values
      are cast to int first, so fractional values between 0 and 1 count as 0.

    Returns:
    - float: Mean NPMI of the scored pairs, or 0 when no pair can be scored
      (empty ``test_data``, fewer than two issues, or no pair co-occurs).

    NOTE(review): pairs that never co-occur (p_ij == 0) are skipped rather than
    scored as -1, which raises the average - confirm this is intended.
    """
    N = len(test_data)
    if N == 0:
        # No rows to estimate probabilities from; the divisions below would
        # otherwise fail with a division by zero.
        return 0

    epsilon = 1e-10

    binary_data = test_data[issue_list].astype(int)
    issue_counts = binary_data.sum(axis=0).to_dict()

    npmi_scores = []
    for i, j in combinations(issue_list, 2):
        p_i = issue_counts[i] / N
        p_j = issue_counts[j] / N
        # Share of rows in which both indicators are set
        p_ij = ((binary_data[i] & binary_data[j]).sum()) / N

        if p_ij > 0:
            pmi = np.log(p_ij / (p_i * p_j + epsilon))
            npmi = pmi / (-np.log(p_ij + epsilon))
            npmi_scores.append(npmi)

    return np.mean(npmi_scores) if npmi_scores else 0

def evaluate_model_with_npmi(model, test_data, train_data, top_n=5):
    """Evaluate an LDA model using the average NPMI across all topics.

    Parameters:
    - model: Fitted model exposing ``components_`` (one row of feature weights
      per topic), e.g. a fitted LatentDirichletAllocation.
    - test_data (pd.DataFrame): Held-out rows used to estimate co-occurrence.
    - train_data (pd.DataFrame): Training frame; only its column names are
      used, to label the columns of ``model.components_``.
    - top_n (int): Number of highest-weighted issues scored per topic
      (default 5, the previously hard-coded value).

    Returns:
    - float: Mean over topics of ``compute_npmi`` for each topic's top issues.
    """
    feature_names = train_data.columns
    topic_words = pd.DataFrame(model.components_, columns=feature_names)

    # Explicit loop over topics: each row holds one topic's feature weights.
    topic_npmis = [
        compute_npmi(weights.nlargest(top_n).index.tolist(), test_data)
        for _, weights in topic_words.iterrows()
    ]
    return np.mean(topic_npmis)

In [39]:
def process_wave_from_df(wave_number, df_wave):
    """Cross-validate the number of LDA topics for one wave using NPMI.

    For each of the ``num_folds`` KFold splits and each K in ``topic_range``
    an LDA model is fitted on the training rows and scored on the held-out
    rows with ``evaluate_model_with_npmi``. The fold-by-K score matrix is
    written to ``{results_folder}/wave{wave_number}_evaluation.csv``.

    Relies on the notebook-level ``num_folds``, ``topic_range`` and
    ``results_folder``.

    Parameters:
    - wave_number (int): Wave label used in log messages and the file name.
    - df_wave (pd.DataFrame): Recoded responses of that wave, including the
      ``country`` and ``year`` columns (dropped before fitting).

    Returns:
    - int: The K with the highest average NPMI across folds (previously this
      value was only printed and then discarded).
    """
    print(f"\n🌊 Processing Wave {wave_number}")
    
    lda_data = df_wave.drop(columns=["country", "year"])
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

    # Fold-by-K score matrix (saved below) plus the per-K score lists used for averaging
    evaluation_matrix = np.zeros((num_folds, len(topic_range)))
    npmi_scores_per_topic = {k: [] for k in topic_range}

    # Fit models and evaluate each fold
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(lda_data)):
        print(f"📂 Fold {fold_idx+1}/{num_folds}")
        
        train_data = lda_data.iloc[train_idx]
        test_data = lda_data.iloc[test_idx]

        for k_idx, n_topics in enumerate(topic_range):
            # NOTE(review): n_jobs=-1 inside a function that is itself run in
            # parallel workers may oversubscribe the CPUs - confirm.
            lda = LatentDirichletAllocation(
                n_components=n_topics,
                doc_topic_prior=0.25,
                topic_word_prior=0.1,
                learning_method='online',
                learning_decay=0.7,
                learning_offset=10.0,
                max_iter=20,
                batch_size=1000,
                evaluate_every=-1,
                mean_change_tol=0.001,
                max_doc_update_iter=100,
                n_jobs=-1,
                random_state=25
            )
            lda.fit(train_data)

            # Evaluate with NPMI for the current fold and topic count
            npmi_score = evaluate_model_with_npmi(lda, test_data, train_data)
            evaluation_matrix[fold_idx, k_idx] = npmi_score

            # Track NPMI scores for each topic count
            npmi_scores_per_topic[n_topics].append(npmi_score)

    # Calculate average NPMI for each topic count
    avg_npmis = {k: np.mean(npmi_scores_per_topic[k]) for k in topic_range}

    # Find the optimal number of topics (K) based on maximum average NPMI
    optimal_k = max(avg_npmis, key=avg_npmis.get)
    print(f"Optimal number of topics for Wave {wave_number}: K={optimal_k}")

    # Save the evaluation matrix (one row per fold, one column per K)
    eval_df = pd.DataFrame(evaluation_matrix, columns=[f"K={k}" for k in topic_range])
    eval_df.to_csv(f"{results_folder}/wave{wave_number}_evaluation.csv", index=False)
    print(f"✅ Saved evaluation results for Wave {wave_number}")

    # Hand the selected K back so callers can use it instead of parsing the log
    return optimal_k

# Evaluate waves 4-7 in parallel, one joblib task per pre-split DataFrame
wave_inputs = {4: df_wave4, 5: df_wave5, 6: df_wave6, 7: df_wave7}
wave_tasks = [
    delayed(process_wave_from_df)(wave_number, df)
    for wave_number, df in wave_inputs.items()
]
Parallel(n_jobs=4)(wave_tasks)

print("\n🎯 Done! All waves processed using NPMI evaluation.")


🌊 Processing Wave 6

🌊 Processing Wave 5
🌊 Processing Wave 4


🌊 Processing Wave 7
📂 Fold 1/10
📂 Fold 1/10
📂 Fold 1/10
📂 Fold 1/10


KeyboardInterrupt: 