# Latent Dirichlet Allocation

Following Mirko Draca and Carlo Schwarz's selection of World Values Survey (WVS) questions and waves.

In [None]:
#pip install -r requirements.txt

In [1]:
import os
import pandas as pd

# Load the WVS extract from the sibling R project folder (path is relative to the working directory).
df_wvs = pd.read_csv(filepath_or_buffer='../master_thesis_R/wvs_ts_w1_w7.csv')

In [None]:
import pandas as pd

def recode_survey_responses(df, question_columns, neutral_values=frozenset({3, 5}), *, wave_column=None):
    """
    Recode World Values Survey answers into support/oppose shares per country and year.

    Following the Draca and Schwarz methodology, each question is turned into two
    indicator variables (support and oppose), missing answers are imputed with the
    mean of the non-missing answers, and the share of supporting and opposing
    answers is computed for every (country, year) pair.

    Parameters:
    df (pd.DataFrame): The original World Values Survey DataFrame (many waves in this df).
        Its "COUNTRY_ALPHA" and "S020" columns are renamed to "country" and "year".
        The DataFrame passed in is not modified.
    question_columns (list): List of columns we wish to transform. Negative values in
        these columns are treated as missing answers.
    neutral_values (set): Values that represent neutrality, default is {3, 5}. Answers
        above the smallest neutral value count as support, answers below it as oppose,
        and answers equal to any neutral value count as neither.
    wave_column (str, optional): Column of ``df`` identifying the survey wave. When given,
        missing answers are imputed with the mean of the same wave; when omitted, the
        mean over the whole column is used.

    Returns:
    pd.DataFrame: One row per (country, year) with the columns "country", "year" and,
        for every question, "<question>_support" and "<question>_oppose" holding the
        share of answers in that group. Groups without any usable answer get 0.

    Raises:
    KeyError: If a requested column (or the country/year columns) is missing.
    ValueError: If ``neutral_values`` is empty.
    """
    # NOTE(review): a single threshold (the smallest neutral value) is applied to every
    # question regardless of its answer scale -- confirm this matches the intended coding.
    threshold = min(neutral_values)
    neutral = list(neutral_values)

    # Use the frame passed in by the caller; nothing below assigns into it.
    new_df = df.rename(columns={"COUNTRY_ALPHA": "country", "S020": "year"})
    # Looked up on the original frame so a wave column that gets renamed above still resolves.
    wave_keys = None if wave_column is None else df[wave_column]

    indicators = {"country": new_df["country"], "year": new_df["year"]}
    for col in question_columns:
        answers = new_df[col]
        is_missing = answers < 0  # negative codes mark missing data
        observed = answers.mask(is_missing)

        if wave_keys is None:
            fill_value = observed.mean()
        else:
            fill_value = observed.groupby(wave_keys).transform("mean")

        # NOTE(review): imputation happens before recoding, so an imputed mean is itself
        # classified as support/oppose -- kept as in the original implementation.
        imputed = answers.mask(is_missing, fill_value)

        # Answers that are still NaN are excluded from the share's denominator.
        answered = imputed.notna()
        not_neutral = ~imputed.isin(neutral)
        support = (not_neutral & (imputed > threshold)).astype(float)
        oppose = (not_neutral & (imputed < threshold)).astype(float)

        indicators[f"{col}_support"] = support.where(answered)
        indicators[f"{col}_oppose"] = oppose.where(answered)

    # The mean of a 0/1 indicator (NaN skipped) is the share of answered rows in the group.
    shares = pd.DataFrame(indicators).groupby(["country", "year"]).mean()
    return shares.fillna(0.0).reset_index()

# Example usage (assumes the survey data is loaded into a DataFrame `df`
# and the question columns are listed in `question_columns`):
#     recode_survey_responses(df, question_columns)



In [None]:
# Survey question columns whose answers are recoded into support/oppose
question_columns = [
    "A124_02", "A124_06", "A124_07", "A124_08", "A124_09",
    "C002",
    "E036", "E037", "E039",
    "F114A", "F115", "F116", "F117", "F118", "F119",
    "F120", "F121", "F122", "F123",
]

# Last expression of the cell, so the notebook displays the resulting DataFrame.
recode_survey_responses(df_wvs, question_columns)