# Latent Dirichlet Allocation

Following Mirko Draca and Carlo Schwarz's selection of WVS questions and waves.

In [None]:
pip install -r requirements.txt

In [4]:
import os
import pandas as pd

# Raw WVS time-series export, read relative to the notebook's location.
df_wvs = pd.read_csv(filepath_or_buffer='../data/raw/wvs_ts_w1_w7.csv')

In [5]:
import pandas as pd

def recode_survey_responses(df, question_columns, neutral_values={3, 5}):
    """
    Recode World Values Survey answers into support/oppose indicator columns.

    Following the Draca & Schwarz (2024) methodology, every question column is
    first cleaned (negative codes are treated as missing and replaced with the
    mean of that column's non-negative values) and then turned into two
    columns, ``<col>_support`` and ``<col>_oppose``.

    Recoding rules:
    - "C002": 1 -> support, 2 -> oppose.
    - "G006": 1 or 2 -> support, 3 or 4 -> oppose.
    - "E036", "E037", "E039" and any column whose name contains "F1":
      values >= 6 -> support, values <= 4 -> oppose.
    - every other column is taken as a 0/1 variable: support is the (imputed)
      value itself and oppose is ``1 - value``.

    Parameters:
    - df (pd.DataFrame): Survey DataFrame. Must contain "COUNTRY_ALPHA",
      "S020" and every column listed in question_columns. It is not modified.
    - question_columns (list): List of columns to transform.
    - neutral_values (set): Currently unused (never read or mutated); kept so
      existing callers keep working.

    Returns:
    - pd.DataFrame: Columns "country" and "year", followed by all
      ``*_support`` columns and then all ``*_oppose`` columns, each group in
      the order of question_columns.
    """
    # Use the DataFrame that was passed in; rename() returns a new object, so
    # the caller's data is left untouched.
    new_df = df.rename(columns={"COUNTRY_ALPHA": "country", "S020": "year"}).copy()

    ten_point_columns = {"E036", "E037", "E039"}

    for col in question_columns:
        answers = new_df[col]

        # Negative codes mark missing data: replace them with the mean of the
        # valid (non-negative) answers of the same column.
        answers = answers.mask(answers < 0, answers[answers >= 0].mean())
        new_df[col] = answers

        if col == "C002":  # 1-3 scale (agree-disagree)
            support, oppose = answers == 1, answers == 2
        elif col == "G006":  # 1-4 scale
            support, oppose = answers.isin([1, 2]), answers.isin([3, 4])
        elif col in ten_point_columns or "F1" in col:  # 1-10 scales
            support, oppose = answers >= 6, answers <= 4
        else:  # binary 0-1: keep the value, the complement is the opposition
            new_df[f"{col}_support"] = answers
            new_df[f"{col}_oppose"] = 1 - answers
            continue

        # Comparisons against NaN are False, so missing answers become 0/0.
        new_df[f"{col}_support"] = support.astype(int)
        new_df[f"{col}_oppose"] = oppose.astype(int)

    # Keep only relevant columns (country, year + support/oppose indicators)
    interest_columns = (
        ["country", "year"]
        + [f"{col}_support" for col in question_columns]
        + [f"{col}_oppose" for col in question_columns]
    )
    return new_df[interest_columns]


In [6]:
# Run the recoding on the selected WVS questions.
# The list is assembled from its three families of codes: A124_* items,
# C/E items plus F114A, and the consecutive F115..F123 items.
question_columns = (
    [f"A124_{suffix}" for suffix in ("02", "06", "07", "08", "09")]
    + ["C002", "E036", "E037", "E039", "F114A"]
    + [f"F{number}" for number in range(115, 124)]
)

df_encoded = recode_survey_responses(df=df_wvs, question_columns=question_columns)

In [7]:
## Inspecting the recoded data
# Show the first five rows of the recoded DataFrame
print(df_encoded.head(n=5))

  country  year  A124_02_support  A124_06_support  A124_07_support  \
0     ALB  1998              0.0              0.0              1.0   
1     ALB  1998              0.0              0.0              1.0   
2     ALB  1998              0.0              0.0              1.0   
3     ALB  1998              0.0              0.0              1.0   
4     ALB  1998              0.0              0.0              1.0   

   A124_08_support  A124_09_support  C002_support  E036_support  E037_support  \
0              1.0              1.0             1             0             1   
1              1.0              1.0             1             0             0   
2              1.0              1.0             1             0             1   
3              1.0              1.0             1             0             1   
4              1.0              1.0             1             0             1   

   ...  F114A_oppose  F115_oppose  F116_oppose  F117_oppose  F118_oppose  \
0  ...          

In [None]:
### One DataFrame per wave, selected by survey year (both bounds inclusive)

# Wave 4: 1999 - 2004
df_wave4 = df_encoded.loc[df_encoded['year'].ge(1999) & df_encoded['year'].le(2004)].copy()

# Wave 5: 2005 - 2009
df_wave5 = df_encoded.loc[df_encoded['year'].ge(2005) & df_encoded['year'].le(2009)].copy()

# Wave 6: 2010 - 2014
df_wave6 = df_encoded.loc[df_encoded['year'].ge(2010) & df_encoded['year'].le(2014)].copy()

# Wave 7: 2017 - 2022
df_wave7 = df_encoded.loc[df_encoded['year'].ge(2017) & df_encoded['year'].le(2022)].copy()

In [None]:
# Recode the raw survey again and keep the result under the name grouped_df
grouped_df = recode_survey_responses(df=df_wvs, question_columns=question_columns)

# LDA Application

In [10]:
import json
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
import seaborn as sns
import matplotlib.pyplot as plt

# Read the variable dictionary stored next to the notebook
with open("variable_dict.json", "r", encoding="utf-8") as file:
    variable_dict = json.loads(file.read())

# df_encoded (the recoded survey data) is the input of the LDA below;
# show its first five rows as a sanity check.
print(df_encoded.head(n=5))

  country  year  A124_02_support  A124_06_support  A124_07_support  \
0     ALB  1998              0.0              0.0              1.0   
1     ALB  1998              0.0              0.0              1.0   
2     ALB  1998              0.0              0.0              1.0   
3     ALB  1998              0.0              0.0              1.0   
4     ALB  1998              0.0              0.0              1.0   

   A124_08_support  A124_09_support  C002_support  E036_support  E037_support  \
0              1.0              1.0             1             0             1   
1              1.0              1.0             1             0             0   
2              1.0              1.0             1             0             1   
3              1.0              1.0             1             0             1   
4              1.0              1.0             1             0             1   

   ...  F114A_oppose  F115_oppose  F116_oppose  F117_oppose  F118_oppose  \
0  ...          

In [None]:
# LDA input: every indicator column of the pooled data, without the identifiers
lda_data = df_encoded.drop(["country", "year"], axis=1)

# One LDA component per ideological group
num_topics = 10
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    doc_topic_prior=0.25,
    topic_word_prior=0.1,
    learning_method="online",  # mini-batch updates rather than full-batch
    learning_decay=0.7,
    learning_offset=10.0,
    max_iter=50,
    batch_size=1000,
    evaluate_every=-1,  # perplexity evaluation switched off
    mean_change_tol=0.001,
    max_doc_update_iter=300,
    n_jobs=-1,
    random_state=25,  # an earlier version used 42
)
lda_matrix = lda_model.fit_transform(lda_data)

# Fitted component weights: one row per topic, one column per feature
feature_names = lda_data.columns
topic_words = pd.DataFrame(lda_model.components_, columns=feature_names)

# Rescale every topic so its weights sum to 1, then flip to features x topics
topic_words = topic_words.div(topic_words.sum(axis=1), axis=0).T
topic_words.columns = [f"Ideology_{k}" for k in range(1, num_topics + 1)]



In [None]:
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation

# Number of LDA components fitted for every wave
num_topics = 10

# Wave label -> recoded responses of that wave
df_waves = dict(wave4=df_wave4, wave5=df_wave5, wave6=df_wave6, wave7=df_wave7)

# Wave label -> normalized feature-by-topic table
lda_results = {}

for wave, df in df_waves.items():
    print(f"Processing {wave}...")

    # The identifier columns are not part of the LDA input
    lda_data = df.drop(["country", "year"], axis=1)

    # A fresh model is created for every wave, always with the same settings
    lda_model = LatentDirichletAllocation(
        n_components=num_topics,
        doc_topic_prior=0.25,
        topic_word_prior=0.1,
        learning_method="online",
        learning_decay=0.7,
        learning_offset=10.0,
        max_iter=50,
        batch_size=1000,
        evaluate_every=-1,
        mean_change_tol=0.001,
        max_doc_update_iter=300,
        n_jobs=-1,
        random_state=25,
    )
    lda_matrix = lda_model.fit_transform(lda_data)

    # Component weights: one row per topic, one column per feature
    feature_names = lda_data.columns
    topic_words = pd.DataFrame(lda_model.components_, columns=feature_names)

    # Each topic is rescaled to sum to 1, then the table is transposed
    topic_words = topic_words.div(topic_words.sum(axis=1), axis=0).T
    topic_words.columns = [f"Ideology_{k}" for k in range(1, num_topics + 1)]

    # Keep the table in memory and write it to the reports directory
    lda_results[wave] = topic_words
    topic_words.to_csv(f"../reports/lda_results_{wave}.csv")

    print(f"{wave} LDA results saved!")

print("✅ All waves processed!")

Processing wave4...
wave4 LDA results saved!
Processing wave5...


KeyboardInterrupt: 

### Wave 4

In [11]:
# LDA input for wave 4: indicator columns only, identifiers removed
lda_data = df_wave4.drop(["country", "year"], axis=1)

# One LDA component per ideological group (num_topics can be adjusted)
num_topics = 10
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    doc_topic_prior=0.25,
    topic_word_prior=0.1,
    learning_method="online",  # mini-batch updates rather than full-batch
    learning_decay=0.7,
    learning_offset=10.0,
    max_iter=50,
    batch_size=1000,
    evaluate_every=-1,  # perplexity evaluation switched off
    mean_change_tol=0.001,
    max_doc_update_iter=300,
    n_jobs=-1,
    random_state=25,  # an earlier version used 42
)
lda_matrix = lda_model.fit_transform(lda_data)

# Fitted component weights: one row per topic, one column per feature
feature_names = lda_data.columns
topic_words = pd.DataFrame(lda_model.components_, columns=feature_names)

# Rescale every topic so its weights sum to 1, then flip to features x topics
topic_words = topic_words.div(topic_words.sum(axis=1), axis=0).T
topic_words.columns = [f"Ideology_{k}" for k in range(1, num_topics + 1)]



### Wave 5

In [None]:
# LDA input for wave 5: indicator columns only, identifiers removed
lda_data = df_wave5.drop(["country", "year"], axis=1)

# One LDA component per ideological group (num_topics can be adjusted)
num_topics = 10
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    doc_topic_prior=0.25,
    topic_word_prior=0.1,
    learning_method="online",  # mini-batch updates rather than full-batch
    learning_decay=0.7,
    learning_offset=10.0,
    max_iter=50,
    batch_size=1000,
    evaluate_every=-1,  # perplexity evaluation switched off
    mean_change_tol=0.001,
    max_doc_update_iter=300,
    n_jobs=-1,
    random_state=25,  # an earlier version used 42
)
lda_matrix = lda_model.fit_transform(lda_data)

# Fitted component weights: one row per topic, one column per feature
feature_names = lda_data.columns
topic_words = pd.DataFrame(lda_model.components_, columns=feature_names)

# Rescale every topic so its weights sum to 1, then flip to features x topics
topic_words = topic_words.div(topic_words.sum(axis=1), axis=0).T
topic_words.columns = [f"Ideology_{k}" for k in range(1, num_topics + 1)]

### Wave 6

In [None]:
# LDA input for wave 6: indicator columns only, identifiers removed
lda_data = df_wave6.drop(["country", "year"], axis=1)

# One LDA component per ideological group (num_topics can be adjusted)
num_topics = 10
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    doc_topic_prior=0.25,
    topic_word_prior=0.1,
    learning_method="online",  # mini-batch updates rather than full-batch
    learning_decay=0.7,
    learning_offset=10.0,
    max_iter=50,
    batch_size=1000,
    evaluate_every=-1,  # perplexity evaluation switched off
    mean_change_tol=0.001,
    max_doc_update_iter=300,
    n_jobs=-1,
    random_state=25,  # an earlier version used 42
)
lda_matrix = lda_model.fit_transform(lda_data)

# Fitted component weights: one row per topic, one column per feature
feature_names = lda_data.columns
topic_words = pd.DataFrame(lda_model.components_, columns=feature_names)

# Rescale every topic so its weights sum to 1, then flip to features x topics
topic_words = topic_words.div(topic_words.sum(axis=1), axis=0).T
topic_words.columns = [f"Ideology_{k}" for k in range(1, num_topics + 1)]

### Wave 7

In [None]:
# LDA input for wave 7: indicator columns only, identifiers removed
lda_data = df_wave7.drop(["country", "year"], axis=1)

# One LDA component per ideological group (num_topics can be adjusted)
num_topics = 10
lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    doc_topic_prior=0.25,
    topic_word_prior=0.1,
    learning_method="online",  # mini-batch updates rather than full-batch
    learning_decay=0.7,
    learning_offset=10.0,
    max_iter=50,
    batch_size=1000,
    evaluate_every=-1,  # perplexity evaluation switched off
    mean_change_tol=0.001,
    max_doc_update_iter=300,
    n_jobs=-1,
    random_state=25,  # an earlier version used 42
)
lda_matrix = lda_model.fit_transform(lda_data)

# Fitted component weights: one row per topic, one column per feature
feature_names = lda_data.columns
topic_words = pd.DataFrame(lda_model.components_, columns=feature_names)

# Rescale every topic so its weights sum to 1, then flip to features x topics
topic_words = topic_words.div(topic_words.sum(axis=1), axis=0).T
topic_words.columns = [f"Ideology_{k}" for k in range(1, num_topics + 1)]

In [None]:
# For every ideology column, list the five features with the largest weight.
# topic_words is whichever table was computed by the most recently run cell.
top_issues = topic_words.apply(
    lambda weights: weights.nlargest(5).index.tolist(),
    axis="index",
)
top_issues

# Write the table (without its row index) next to the notebook
top_issues.to_csv(path_or_buf='top_issues.csv', index=False)

Unnamed: 0,Ideology_1,Ideology_2,Ideology_3,Ideology_4,Ideology_5
0,A124_07_support,F119_oppose,F121_support,F115_support,A124_06_support
1,A124_09_support,F123_oppose,A124_02_oppose,F114A_support,A124_02_support
2,A124_08_support,F120_oppose,F117_oppose,F116_support,F119_oppose
3,F118_oppose,A124_02_oppose,A124_06_oppose,F121_support,F123_oppose
4,F119_oppose,A124_06_oppose,A124_09_oppose,F120_support,F117_oppose


In [None]:
import pandas as pd
import json

# Variable code -> description mapping
with open('variable_dict.json', 'r') as f:
    variable_dict = json.loads(f.read())

# Top-issue table written by the previous step
top_issues = pd.read_csv(filepath_or_buffer='top_issues.csv')

# Swap each issue code for its dictionary description, keeping the trailing
# 'support' / 'oppose' component
def replace_with_description(issue_codes):
    """Return one '<description>_<suffix>' label per code in issue_codes.

    The text before the last underscore is looked up in variable_dict and the
    text after it is re-appended. Codes whose base is not in the dictionary
    get 'Description Not Found: <code>' as their description.
    """
    labels = []
    for code in issue_codes:
        pieces = code.rsplit('_', 1)
        base, suffix = pieces[0], pieces[-1]
        description = variable_dict.get(base, f'Description Not Found: {code}')
        labels.append(f"{description}_{suffix}")
    return labels

# Translate the codes one column (ideology) at a time
top_issues_descriptive = top_issues.apply(replace_with_description, axis="index")

# Show the table with descriptive labels
print(top_issues_descriptive)

                              Ideology_1  \
0  People with AIDS as neighbors_support   
1       Homosexuals as neighbors_support   
2      Drug addicts as neighbors_support   
3    Homosexuality – justifiable?_oppose   
4     Prostitution – justifiable?_oppose   

                                       Ideology_2  \
0              Prostitution – justifiable?_oppose   
1                   Suicide – justifiable?_oppose   
2                  Abortion – justifiable?_oppose   
3              Different race as neighbors_oppose   
4  Immigrants foreign workers as neighbors_oppose   

                                        Ideology_3  \
0                   Divorce – justifiable?_support   
1               Different race as neighbors_oppose   
2  Someone accepting a bribe – justifiable?_oppose   
3   Immigrants foreign workers as neighbors_oppose   
4                  Homosexuals as neighbors_oppose   

                                          Ideology_4  \
0  Avoiding a fare on public transp

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Encoded dataset (expected to be numeric).
# NOTE(review): "your_encoded_data.csv" looks like a placeholder path, and this
# assignment replaces any df_encoded built earlier in the notebook - confirm.
df_encoded = pd.read_csv(filepath_or_buffer="your_encoded_data.csv")

# Shuffled 10-fold splitter with a fixed seed
k_folds = 10
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Candidate numbers of ideological types (topics): 1 through 10
n_components_range = range(1, 11)

# Cohesion score used to compare LDA models
def topic_cohesion_score(topic_distributions, df_original):
    """
    Score a set of topics by the average level of their top features.

    For every topic (a row of weights), the five highest-weighted feature
    positions are selected, the mean of each of those columns of df_original
    is taken, and these column means are averaged. The function returns the
    mean of the per-topic values; larger is treated as better.
    """
    per_topic = [
        df_original.iloc[:, np.argsort(weights)[-5:]].mean().mean()
        for weights in topic_distributions
    ]
    return np.mean(per_topic)

# Best configuration found so far
best_cohesion = -np.inf
best_n_components = None
best_lda_model = None

# Try every candidate number of ideological types
for n_components in n_components_range:
    cohesion_scores = []

    for train_index, test_index in kf.split(df_encoded):
        train_data, test_data = df_encoded.iloc[train_index], df_encoded.iloc[test_index]

        # The data is deliberately NOT standardized: LatentDirichletAllocation
        # only accepts non-negative input, and StandardScaler centres every
        # column around zero, so fitting on scaled data raises ValueError.
        lda = LatentDirichletAllocation(
            n_components=n_components,
            learning_method='batch',  # Use batch for stability
            max_iter=10,
            random_state=42
        )
        lda.fit(train_data)

        # Score the topics learned on this fold's training rows.
        # NOTE(review): the score is computed on the full df_encoded, so the
        # held-out rows (test_data) do not influence it - confirm whether the
        # score should use test_data instead.
        cohesion = topic_cohesion_score(lda.components_, df_encoded)
        cohesion_scores.append(cohesion)

    # Average cohesion score across all folds
    avg_cohesion = np.mean(cohesion_scores)

    # Remember the best candidate; the stored model is the one fitted on the
    # last fold for this number of components.
    if avg_cohesion > best_cohesion:
        best_cohesion = avg_cohesion
        best_n_components = n_components
        best_lda_model = lda

# Print the best model parameters
print(f"Best Number of Ideological Types: {best_n_components}")
print(f"Best Topic Cohesion Score: {best_cohesion}")

# Save the best LDA model
import joblib
joblib.dump(best_lda_model, "best_lda_model.pkl")

## Try this next time

In [None]:
import pandas as pd
import os
import json
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import KFold

# **SETTINGS**
num_topics, num_folds = 10, 10
data_folder, reports_folder = "data", "reports"

# Create the reports folder when it is missing
os.makedirs(reports_folder, exist_ok=True)

# Variable code -> description mapping.
# NOTE(review): the file is spelled 'variable_dic.json' here, while the other
# cells of this notebook open 'variable_dict.json' - confirm the file name.
dictionary_path = os.path.join(data_folder, 'variable_dic.json')
with open(dictionary_path, 'r') as f:
    variable_dict = json.loads(f.read())

# Function to process each wave
def process_wave(wave_number):
    """
    Fit an LDA model on one wave and save its top issues per ideology.

    Reads ``wave<wave_number>_data.csv`` from data_folder, fits a
    num_topics-component LDA on the training part of the first shuffled
    K-fold split, and writes the ten highest-weighted issues of every
    ideology to ``top_issues_wave<wave_number>.csv`` in reports_folder.

    Parameters:
    - wave_number (int): Wave identifier used in the input and output file names.

    Returns:
    - None. The result is written to disk and the output path is printed.
    """
    # Load wave data
    file_path = os.path.join(data_folder, f"wave{wave_number}_data.csv")
    df = pd.read_csv(file_path)

    # The identifier columns are not part of the LDA input
    data = df.drop(columns=["country", "year"])

    # Only the training rows of the first fold are used; the held-out rows
    # and the remaining folds are not needed.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=25)
    train_idx, _ = next(kf.split(data))
    train_data = data.iloc[train_idx]

    # Train LDA Model
    lda_model = LatentDirichletAllocation(
        n_components=num_topics,
        doc_topic_prior=0.25,
        topic_word_prior=0.1,
        learning_method='online',
        learning_decay=0.7,
        learning_offset=10.0,
        max_iter=50,
        batch_size=1000,
        evaluate_every=-1,
        mean_change_tol=0.001,
        max_doc_update_iter=300,
        n_jobs=-1,
        random_state=25
    )
    # Only the fitted components are used below, so fit() is sufficient
    lda_model.fit(train_data)

    # Component weights: one row per topic, one column per feature
    feature_names = data.columns
    topic_words = pd.DataFrame(lda_model.components_, columns=feature_names)

    # Rescale every topic to sum to 1, then transpose to features x topics
    topic_words = topic_words.div(topic_words.sum(axis=1), axis=0)
    topic_words = topic_words.T
    topic_words.columns = [f"Ideology_{i+1}" for i in range(num_topics)]

    # Ten highest-weighted issues per ideology
    top_issues = topic_words.apply(lambda x: x.nlargest(10).index.tolist(), axis=0)

    def replace_with_description(issue_codes):
        """Map '<code>_<support|oppose>' names to '<description>_<suffix>'."""
        labels = []
        for code in issue_codes:
            # Split on the LAST underscore only: variable codes can contain
            # underscores themselves (e.g. 'A124_02_support' -> 'A124_02').
            pieces = code.rsplit('_', 1)
            base, suffix = pieces[0], pieces[-1]
            # Unknown codes fall back to the code itself; the suffix is kept so
            # support and oppose positions stay distinguishable.
            labels.append(f"{variable_dict.get(base, base)}_{suffix}")
        return labels

    top_issues_descriptive = top_issues.apply(replace_with_description, axis=0)

    # Save to reports folder
    output_file = os.path.join(reports_folder, f"top_issues_wave{wave_number}.csv")
    top_issues_descriptive.to_csv(output_file, index=False)
    print(f"✅ Saved: {output_file}")

# Process waves 4, 5, 6 and 7 in order
for wave in (4, 5, 6, 7):
    process_wave(wave)

print("🎉 All waves processed successfully!")