# Latent Dirichlet Allocation

Following Mirko Draca and Carlo Schwarz's selection of World Values Survey (WVS) questions and waves.

In [None]:
#pip install -r requirements.txt

In [1]:
import os
import pandas as pd

# Load the raw survey data from CSV; the path is relative to the current working directory.
# Presumably the WVS time-series file covering waves 1-7, judging by the file name — confirm.
df_wvs = pd.read_csv('../master_thesis_R/wvs_ts_w1_w7.csv')

In [None]:
import pandas as pd

def recode_survey_responses(df: pd.DataFrame, question_columns, neutral_values={3, 5}) -> pd.DataFrame:
    """
    Recode World Values Survey responses into support/oppose shares per country and year.

    Loosely following the Draca and Schwarz (2024) methodology, every question is turned
    into two indicator variables (support and oppose) which are then averaged within each
    (country, year) cell.

    Processing steps:
      1. Negative response codes are treated as missing and replaced by the mean of the
         non-negative responses to the same question. The mean is pooled over every row
         of ``df`` (all countries and waves present), not computed wave by wave.
      2. A response counts as "support" when it is greater than ``min(neutral_values)``
         and is not itself a neutral value, and as "oppose" when it is smaller than
         ``min(neutral_values)``. Neutral values and NaN count as neither.
         Imputed means go through the same rule, so an imputed row is classified
         according to where the question mean falls relative to the threshold.
      3. The indicators are averaged by 'country' and 'year'.

    Parameters:
    df (pd.DataFrame): Survey data, one row per respondent. Its "COUNTRY_ALPHA" and "S020"
        columns are renamed to "country" and "year". The frame is not modified.
    question_columns (list): Names of the question columns to transform.
    neutral_values (set): Values that represent neutrality, default is {3, 5}. Only read,
        never mutated.

    Returns:
    pd.DataFrame: One row per (country, year) with the columns 'country', 'year', then
        '<question>_support' for every question, then '<question>_oppose' for every question.

    Raises:
    KeyError: If a requested question column, or the country/year columns, are missing.
    ValueError: If ``neutral_values`` is empty.
    """
    # Work on the frame that was passed in (not on a notebook-level global).
    # rename() returns a new DataFrame, so the caller's data is left untouched.
    new_df = df.rename(columns={"COUNTRY_ALPHA": "country", "S020": "year"})

    # Loop invariants: the cut-off separating oppose from support, and the neutral codes.
    threshold = min(neutral_values)
    neutral_codes = list(neutral_values)

    support_cols = [f"{col}_support" for col in question_columns]
    oppose_cols = [f"{col}_oppose" for col in question_columns]

    for col, support_col, oppose_col in zip(question_columns, support_cols, oppose_cols):
        responses = new_df[col]

        # Impute missing values (negative codes) with the mean of the non-missing data.
        # mask() only touches values that compare < 0, so NaN entries stay NaN.
        mean_value = responses[responses >= 0].mean()
        responses = responses.mask(responses < 0, mean_value)
        new_df[col] = responses

        # Support / oppose indicators (1 or 0); neutral answers are 0 in both.
        is_neutral = responses.isin(neutral_codes)
        new_df[support_col] = (~is_neutral & (responses > threshold)).astype(int)
        new_df[oppose_col] = (~is_neutral & (responses < threshold)).astype(int)

    # Share of support and oppose answers within each country-year cell.
    grouped_df = new_df.groupby(['country', 'year'])[support_cols + oppose_cols].mean().reset_index()

    return grouped_df

# Example usage (assumes the survey data is loaded into a DataFrame `df` and the
# columns to recode are listed in `question_columns`):
#     shares = recode_survey_responses(df, question_columns)



In [24]:
## WVS question codes whose answers are recoded into support/oppose shares
# NOTE(review): in the output shown below, the A124_* and C002 support shares are 0.0 in
# every displayed row — presumably their answer scales never exceed the default
# neutral_values={3, 5} threshold; confirm whether they need their own neutral_values.
question_columns = [
    "A124_02", "A124_06", "A124_07", "A124_08", "A124_09",
    "C002",
    "E036", "E037", "E039",
    "F114A", "F115", "F116", "F117", "F118", "F119", "F120", "F121", "F122", "F123",
]

recode_survey_responses(df_wvs, question_columns)

Unnamed: 0,country,year,A124_02_support,A124_06_support,A124_07_support,A124_08_support,A124_09_support,C002_support,E036_support,E037_support,...,F114A_oppose,F115_oppose,F116_oppose,F117_oppose,F118_oppose,F119_oppose,F120_oppose,F121_oppose,F122_oppose,F123_oppose
0,ALB,1998,0.0,0.0,0.0,0.0,0.0,0.0,0.377377,0.794795,...,0.214214,0.716717,0.706707,0.665666,0.713714,0.747748,0.071071,0.073073,0.484484,0.818819
1,ALB,2002,0.0,0.0,0.0,0.0,0.0,0.0,0.382000,0.752000,...,0.777000,0.670000,0.780000,0.680000,0.848000,0.905000,0.319000,0.235000,0.556000,0.832000
2,AND,2005,0.0,0.0,0.0,0.0,0.0,0.0,0.486540,0.715852,...,0.659023,0.621137,0.695912,0.906281,0.070788,0.222333,0.101695,0.030907,0.094716,0.480558
3,AND,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.459163,0.749004,...,0.771912,0.787849,0.845618,0.934263,0.070717,0.318725,0.196215,0.048805,0.168327,0.511952
4,ARG,1984,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,1.000000,...,0.802985,0.749254,0.752239,0.905473,0.624876,0.793035,0.475622,0.283582,0.714428,0.889552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,ZAF,2013,0.0,0.0,0.0,0.0,0.0,0.0,0.693854,0.572359,...,0.422260,0.393373,0.442084,0.432739,0.353724,0.439536,0.405834,0.285188,0.309827,0.438969
303,ZMB,2007,0.0,0.0,0.0,0.0,0.0,0.0,0.658667,0.690000,...,0.554667,0.519333,0.494000,0.528667,0.652000,0.554667,0.550000,0.334000,0.445333,0.653333
304,ZWE,2001,0.0,0.0,0.0,0.0,0.0,0.0,0.360279,0.756487,...,0.864271,0.898204,0.884232,0.948104,0.967066,0.959082,0.935130,0.775449,0.916168,0.956088
305,ZWE,2012,0.0,0.0,0.0,0.0,0.0,0.0,0.444000,0.806000,...,0.639333,0.668667,0.661333,0.675333,0.820667,0.744667,0.757333,0.512000,0.000000,0.752667


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_support_trends(df, question_code, countries):
    """
    Draw a line chart of the support share for one question over time, one line per country.

    Parameters:
    df (pd.DataFrame): Frame with 'country' and 'year' columns plus a
        '<question_code>_support' column holding the share to plot.
    question_code (str): The survey question to visualize (e.g., 'A124_02').
    countries (list): Values of the 'country' column to keep in the plot.

    Returns:
    None. The figure is displayed with plt.show().
    """
    plt.figure(figsize=(12, 6))

    # Keep only the rows belonging to the requested countries
    support_column = f"{question_code}_support"
    is_selected = df['country'].isin(countries)
    subset = df.loc[is_selected]

    # One line per country, with a marker at every observed year
    sns.lineplot(
        x='year',
        y=support_column,
        hue='country',
        marker="o",
        data=subset,
    )

    # Labels and cosmetics
    plt.xlabel("Year")
    plt.ylabel("Share of Support")
    plt.title(f"Support Share Trends for {question_code}")
    plt.grid(True)
    plt.legend(title="Country")
    plt.show()

# Example usage: plot the support share of one question for a few countries.
# `grouped_df` is only a local variable inside recode_survey_responses, so the
# country-year shares are computed and bound to a name here before plotting.
grouped_df = recode_survey_responses(df_wvs, question_columns)

# The 'country' column holds three-letter codes (e.g. "ALB", "ZAF" in the output above),
# so countries are selected by code; full names such as "Germany" would match no rows.
# Presumably ISO 3166-1 alpha-3 codes — confirm that "DEU" and "FRA" occur in the data.
selected_countries = ["USA", "DEU", "FRA"]
plot_support_trends(grouped_df, "A124_02", selected_countries)