# WVS Wave 7 Data Analysis - Aggregation Stage

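- This notebook aggregates the imputed survey data by demographic profile (each unique combination of the configured demographic variables).
- For every survey question it computes one representative response per demographic group: the mode (most frequent response) in the first part, and the median in the "Using Median" section further down.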

In [None]:
# =============================================================================
# Imports
# =============================================================================
import pathlib
import warnings
import pandas as pd
import numpy as np
from scipy import stats

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='pandas') # Ignore potential pandas warnings


In [None]:
# =============================================================================
# Configuration / Constants
# =============================================================================
# All pipeline inputs and outputs live under this directory.
DATA_DIR = pathlib.Path("./data")

# --- Input Data ---
# Result of the default IterativeImputer run (chosen based on performance).
# Alternative input from MICE with custom estimators: "wvs_wave7_imputed_final.csv"
IMPUTED_DATA_PATH = DATA_DIR.joinpath("wvs_wave7_imputed.csv")

# --- Output Data ---
# Destination of the mode-aggregated table.
AGGREGATED_OUTPUT_CSV_PATH = DATA_DIR.joinpath("new_mode_wvs_wave7_aggregated_by_demographics.csv")

In [None]:
# --- Aggregation Parameters ---
# Demographic variables that define a "profile": rows sharing the same value
# for every variable listed here are collapsed into one aggregated row.
# Ideally this matches the grouping variables used in the imputation stage.
# Entries that are commented out are deliberately excluded from the grouping.
AGGREGATION_DEMOGRAPHIC_VARS = [
    'B_COUNTRY_ALPHA', # country
    'H_URBRURAL',      # urban / rural
    # 'Q273',          # marital status
    'Q260',            # sex
    'X003R2',          # age (3-cat)
    'Q275R',           # education (3 groups)
    # 'Q279',          # employment
]

# Aggregation function to use ('mode', 'median').
# NOTE: main() in this section only accepts 'mode'; any other value makes it
# raise ValueError. The median variant is run as a separate experiment.
AGGREGATION_METHOD = 'mode'

In [None]:
# =============================================================================
# Function Definitions
# =============================================================================

def load_imputed_data(filepath: pathlib.Path, required_cols: "list[str] | None" = None) -> pd.DataFrame:
    """Load the imputed survey data from a CSV file and validate its columns.

    Args:
        filepath: Location of the imputed-data CSV.
        required_cols: Column names that must be present in the file. When
            omitted, the module-level ``AGGREGATION_DEMOGRAPHIC_VARS`` is used.

    Returns:
        The CSV contents as a DataFrame.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        IOError: If the file exists but cannot be read or parsed. The
            underlying exception is attached as ``__cause__``.
        ValueError: If any required column is missing from the file.
    """
    print(f"Loading imputed data from: {filepath}")
    if not filepath.exists():
        raise FileNotFoundError(
            f"Error: Imputed data file not found at '{filepath}'. "
            "Please ensure the imputation script ran successfully and saved the file."
        )
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        # Every reader failure (parse error, encoding, permissions, empty file)
        # is reported uniformly as IOError; chaining keeps the original traceback.
        raise IOError(f"Error reading imputed data file '{filepath}': {e}") from e
    print(f"Loaded imputed data shape: {df.shape}")

    # Fall back to the configured grouping variables when the caller gives none.
    if required_cols is None:
        required_cols = AGGREGATION_DEMOGRAPHIC_VARS

    # Fail early: grouping later on would break without these columns.
    missing_req_cols = [col for col in required_cols if col not in df.columns]
    if missing_req_cols:
        raise ValueError(f"Error: Required demographic columns for aggregation missing from imputed data: {missing_req_cols}")
    return df

def identify_columns(df: pd.DataFrame, demographic_vars: list[str]) -> tuple[list[str], list[str]]:
    """Split the DataFrame's columns into grouping variables and survey questions.

    Args:
        df: The loaded survey data.
        demographic_vars: Requested grouping columns; names absent from ``df``
            are skipped after printing a warning.

    Returns:
        ``(demographic_columns, survey_columns)``. The first keeps the order of
        ``demographic_vars``; the second is every remaining column in the order
        it appears in ``df``.

    Raises:
        ValueError: If no column is left over to serve as a survey question.
    """
    print("Identifying demographic and survey columns...")
    present = df.columns.tolist()

    # Keep only the requested grouping variables that the data actually has.
    grouping_vars = [name for name in demographic_vars if name in present]
    if len(demographic_vars) > len(grouping_vars):
        print(f"Warning: Not all specified demographic vars found in data. Using: {grouping_vars}")

    # Everything that is not a grouping variable counts as a survey question.
    question_cols = [name for name in present if name not in grouping_vars]

    print(f"Found {len(grouping_vars)} demographic variables.")
    print(f"Found {len(question_cols)} survey question columns.")

    if len(question_cols) == 0:
        raise ValueError("Error: No survey question columns identified. Check demographic variable list.")

    return grouping_vars, question_cols

def aggregate_data_by_mode(df: pd.DataFrame, group_by_cols: list[str], aggregate_cols: list[str]) -> pd.DataFrame:
    """Collapse each demographic group to the most frequent value of every survey column.

    Args:
        df: Row-level survey data.
        group_by_cols: Columns whose unique value combinations define the groups.
            Rows with a missing value in any of them are left out (``dropna=True``).
        aggregate_cols: Columns to reduce to their per-group mode.

    Returns:
        One row per group: the grouping columns followed by ``aggregate_cols``.
        When a group has several modes, the first entry of ``Series.mode()`` is
        used (pandas returns modes in sorted order). A column that is entirely
        missing within a group yields NaN.

    Raises:
        ValueError: If ``group_by_cols`` or ``aggregate_cols`` is empty.
        RuntimeError: If grouping or aggregation fails. The underlying
            exception is attached as ``__cause__``.
    """
    print(f"Aggregating {len(aggregate_cols)} survey columns by {len(group_by_cols)} demographic variables using mode...")
    if not group_by_cols:
        raise ValueError("Error: No columns specified for grouping.")
    if not aggregate_cols:
        raise ValueError("Error: No columns specified for aggregation.")

    def get_first_mode(values):
        # Series.mode() ignores NaN, so it is empty only when all values are missing.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    try:
        grouped = df.groupby(group_by_cols, observed=False, dropna=True)

        # agg() calls the reducer once per column and per group. Unlike
        # apply(..., include_groups=False), it does not need pandas >= 2.2 and
        # it keeps each column's own dtype.
        aggregated_data = grouped[aggregate_cols].agg(get_first_mode).reset_index()
    except Exception as e:
        print(f"Error during aggregation: {e}")
        raise RuntimeError(f"Aggregation failed: {e}") from e

    print(f"Aggregation complete. Reduced dataset from {len(df):,} rows to {len(aggregated_data):,} rows (unique demographic profiles).")
    return aggregated_data

def save_aggregated_data(df: pd.DataFrame, filepath: pathlib.Path):
    """Write the aggregated table to ``filepath`` as CSV, without the index.

    Missing parent directories are created first. This is a best-effort step:
    any failure is printed and swallowed, so nothing is raised to the caller.
    """
    print(f"\nSaving aggregated data to: {filepath}")
    try:
        target_dir = filepath.parent
        target_dir.mkdir(parents=True, exist_ok=True)  # make sure the folder is there
        df.to_csv(filepath, index=False)
        dims = df.shape
        print(f"Successfully saved {filepath} ({dims[0]:,} rows × {dims[1]} cols)")
    except Exception as e:
        print(f"Error saving aggregated data to {filepath}: {e}")

In [None]:
# =============================================================================
# Main Execution Workflow
# =============================================================================

def main():
    """Run the mode-aggregation pipeline: load, split columns, aggregate, preview, save.

    Nothing is raised to the caller; every failure is reported on stdout with a
    headline that depends on the exception type.
    """
    print("--- Starting WVS Data Aggregation Pipeline ---")

    try:
        # 1. Load Imputed Data
        df_imputed = load_imputed_data(IMPUTED_DATA_PATH)

        # Kept for reference only (this was already dealt with upstream):
        # columns_to_drop = ['H_URBRURAL'] # Add any other columns to drop entirely
        # df_imputed = df_imputed.drop(columns=[col for col in columns_to_drop if col in df_imputed.columns])

        # 2. Identify Columns for Grouping and Aggregation
        demographic_cols, survey_cols = identify_columns(df_imputed, AGGREGATION_DEMOGRAPHIC_VARS)

        # 3. Aggregate Data -- only 'mode' is handled here; the median variant
        #    is a separate experiment.
        if AGGREGATION_METHOD != 'mode':
            raise ValueError(f"Unsupported AGGREGATION_METHOD: {AGGREGATION_METHOD}")
        df_aggregated = aggregate_data_by_mode(df_imputed, demographic_cols, survey_cols)

        # 4. Display Head of Aggregated Data
        print("\nFirst 5 rows of aggregated data:")
        print(df_aggregated.head())

        # 5. Save Aggregated Data
        save_aggregated_data(df_aggregated, AGGREGATED_OUTPUT_CSV_PATH)

        print("\n--- WVS Data Aggregation Pipeline Finished ---")

    except Exception as e:
        # Checked top to bottom; the first matching type picks the headline.
        headlines = (
            (FileNotFoundError, "\nCritical Error: Input file not found."),
            (ValueError, "\nCritical Error: Configuration or Data invalid."),
            (RuntimeError, "\nCritical Error: Aggregation process failed."),
        )
        fallback = "\nAn unexpected error occurred during the pipeline execution:"
        print(next((text for kind, text in headlines if isinstance(e, kind)), fallback))
        print(e)
        # For a detailed traceback while debugging:
        # import traceback
        # traceback.print_exc()

In [56]:
# Entry point: run the pipeline only when this code executes as the main
# module (as it does in a notebook cell or when run as a script).
if __name__ == "__main__":
    main()

--- Starting WVS Data Aggregation Pipeline ---
Loading imputed data from: data\wvs_wave7_imputed.csv
Loaded imputed data shape: (97220, 102)
Identifying demographic and survey columns...
Found 5 demographic variables.
Found 97 survey question columns.
Aggregating 97 survey columns by 5 demographic variables using mode...
Aggregation complete. Reduced dataset from 97,220 rows to 2,250 rows (unique demographic profiles).

First 5 rows of aggregated data:
  B_COUNTRY_ALPHA  H_URBRURAL  Q260  X003R2  Q275R  Q106  Q107  Q108  Q109  \
0             AND         1.0   1.0     1.0    1.0  10.0   5.0  10.0   1.0   
1             AND         1.0   1.0     1.0    2.0   8.0   5.0   2.0   2.0   
2             AND         1.0   1.0     1.0    3.0   8.0   5.0   2.0   1.0   
3             AND         1.0   1.0     2.0    1.0  10.0   5.0   5.0   2.0   
4             AND         1.0   1.0     2.0    2.0   7.0   5.0   5.0   1.0   

   Q110  ...  Q69P  Q6P  Q70P  Q71P  Q72P  Q73P  Q77P  Q8P  Q90  Q9P  
0  

```python
--- Starting WVS Data Aggregation Pipeline ---
Loading imputed data from: data\wvs_wave7_imputed.csv
Loaded imputed data shape: (97220, 102)
Identifying demographic and survey columns...
Found 5 demographic variables.
Found 97 survey question columns.
Aggregating 97 survey columns by 5 demographic variables using mode...
Aggregation complete. Reduced dataset from 97,220 rows to 2,250 rows (unique demographic profiles).

First 5 rows of aggregated data:
  B_COUNTRY_ALPHA  H_URBRURAL  Q260  X003R2  Q275R  Q106  Q107  Q108  Q109  \
0             AND         1.0   1.0     1.0    1.0  10.0   5.0  10.0   1.0   
1             AND         1.0   1.0     1.0    2.0   8.0   5.0   2.0   2.0   
2             AND         1.0   1.0     1.0    3.0   8.0   5.0   2.0   1.0   
3             AND         1.0   1.0     2.0    1.0  10.0   5.0   5.0   2.0   
4             AND         1.0   1.0     2.0    2.0   7.0   5.0   5.0   1.0   

   Q110  ...  Q69P  Q6P  Q70P  Q71P  Q72P  Q73P  Q77P  Q8P  Q90  Q9P  
0  10.0  ...   3.0  1.0   2.0   1.0   1.0   2.0   1.0  0.0  1.0  0.0  
1   3.0  ...   3.0  1.0   3.0   2.0   2.0   2.0   3.0  0.0  2.0  0.0  
2   5.0  ...   3.0  1.0   3.0   3.0   2.0   3.0   3.0  0.0  2.0  0.0  
3   3.0  ...   3.0  1.0   3.0   2.0   1.0   2.0   3.0  0.0  5.0  1.0  
4   4.0  ...   3.0  1.0   3.0   3.0   2.0   2.0   2.0  0.0  2.0  0.0  

[5 rows x 102 columns]

Saving aggregated data to: data\new_mode_wvs_wave7_aggregated_by_demographics.csv
Successfully saved data\new_mode_wvs_wave7_aggregated_by_demographics.csv (2,250 rows × 102 cols)

--- WVS Data Aggregation Pipeline Finished ---
```

## Using Median

In [None]:
# =============================================================================
# Imports
# =============================================================================
import pathlib
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning, module='pandas') # Ignore potential pandas warnings


In [None]:
# =============================================================================
# Configuration / Constants
# =============================================================================
# All pipeline inputs and outputs live under this directory.
DATA_DIR = pathlib.Path("./data")

# --- Input Data ---
# Result of the default IterativeImputer run.
# Alternative input from MICE with custom estimators: "wvs_wave7_imputed_final.csv"
IMPUTED_DATA_PATH = DATA_DIR.joinpath("wvs_wave7_imputed.csv")

# --- Output Data ---
# The file name carries "median" so it does not collide with the mode output.
AGGREGATED_OUTPUT_CSV_PATH = DATA_DIR.joinpath("new_median_wvs_wave7_aggregated_by_demographics.csv")


In [None]:
# --- Aggregation Parameters ---
# Demographic variables that define a "profile": rows sharing the same value
# for every variable listed here are collapsed into one aggregated row.
# Ideally this matches the grouping variables used in the imputation stage.
# Entries that are commented out are deliberately excluded from the grouping.
# NOTE(review): an earlier remark here asked to keep `columns_to_drop` in sync
# when removing variables, but no `columns_to_drop` is defined in this
# section -- confirm whether that step still exists anywhere.
AGGREGATION_DEMOGRAPHIC_VARS = [
    'B_COUNTRY_ALPHA', # country
    'H_URBRURAL',      # urban / rural
    # 'Q273',          # marital status
    'Q260',            # sex
    'X003R2',          # age (3-cat)
    'Q275R',           # education (3 groups)
    # 'Q279',          # employment
]

# Aggregation function to use ('mode', 'median').
# NOTE: main() in this section only accepts 'median'; any other value makes it
# raise ValueError.
AGGREGATION_METHOD = 'median'

In [None]:
# =============================================================================
# Function Definitions
# =============================================================================

def load_imputed_data(filepath: pathlib.Path, required_cols: "list[str] | None" = None) -> pd.DataFrame:
    """Load the imputed survey data from a CSV file and validate its columns.

    Args:
        filepath: Location of the imputed-data CSV.
        required_cols: Column names that must be present in the file. When
            omitted, the module-level ``AGGREGATION_DEMOGRAPHIC_VARS`` is used.

    Returns:
        The CSV contents as a DataFrame.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        IOError: If the file exists but cannot be read or parsed. The
            underlying exception is attached as ``__cause__``.
        ValueError: If any required column is missing from the file.
    """
    print(f"Loading imputed data from: {filepath}")
    if not filepath.exists():
        raise FileNotFoundError(
            f"Error: Imputed data file not found at '{filepath}'. "
            "Please ensure the imputation script ran successfully and saved the file."
        )
    try:
        df = pd.read_csv(filepath)
    except Exception as e:
        # Every reader failure (parse error, encoding, permissions, empty file)
        # is reported uniformly as IOError; chaining keeps the original traceback.
        raise IOError(f"Error reading imputed data file '{filepath}': {e}") from e
    print(f"Loaded imputed data shape: {df.shape}")

    # Fall back to the configured grouping variables when the caller gives none.
    if required_cols is None:
        required_cols = AGGREGATION_DEMOGRAPHIC_VARS

    # Validation happens here, immediately after loading: every required
    # column must exist in the file as read, otherwise the load is rejected.
    missing_req_cols = [col for col in required_cols if col not in df.columns]
    if missing_req_cols:
        raise ValueError(f"Error: Required demographic columns for aggregation missing from imputed data: {missing_req_cols}")
    return df

def identify_columns(df: pd.DataFrame, demographic_vars: list[str]) -> tuple[list[str], list[str]]:
    """Split the DataFrame's columns into grouping variables and survey questions.

    The ``demographic_vars`` argument is only read, never modified, so any
    sequence (including a tuple or the module-level configuration list) can be
    passed safely.

    Args:
        df: The loaded survey data.
        demographic_vars: Requested grouping columns; names absent from ``df``
            are skipped after printing a warning.

    Returns:
        ``(demographic_columns, survey_columns)``. The first keeps the order of
        ``demographic_vars``; the second is every remaining column in the order
        it appears in ``df``.

    Raises:
        ValueError: If no survey question column remains, or if none of the
            requested demographic variables exists in ``df``.
    """
    print("Identifying demographic and survey columns...")
    all_columns = df.columns.tolist()

    # Keep only the requested grouping variables that the data actually has.
    valid_demographic_vars = [var for var in demographic_vars if var in all_columns]
    if len(valid_demographic_vars) < len(demographic_vars):
        print(f"Warning: Using these demographic vars found in data for grouping: {valid_demographic_vars}")

    # Everything that is not a grouping variable counts as a survey question.
    survey_question_cols = [
        col for col in all_columns if col not in valid_demographic_vars
    ]
    print(f"Found {len(valid_demographic_vars)} demographic variables for grouping.")
    print(f"Found {len(survey_question_cols)} survey question columns for aggregation.")

    if not survey_question_cols:
        raise ValueError("Error: No survey question columns identified. Check demographic variable list and dropped columns.")
    if not valid_demographic_vars:
        raise ValueError("Error: No demographic columns identified for grouping. Check demographic variable list and dropped columns.")

    return valid_demographic_vars, survey_question_cols

# --- NEW FUNCTION FOR MEDIAN ---
def aggregate_data_by_median(df: pd.DataFrame, group_by_cols: list[str], aggregate_cols: list[str]) -> pd.DataFrame:
    """Collapse each demographic group to the median of every survey column.

    Args:
        df: Row-level survey data.
        group_by_cols: Columns whose unique value combinations define the groups.
            Rows with a missing value in any of them are left out (``dropna=True``).
        aggregate_cols: Columns to reduce to their per-group median. They are
            expected to be numeric; non-numeric ones trigger a printed warning
            but are still passed on to pandas.

    Returns:
        One row per group: the grouping columns followed by ``aggregate_cols``.
        Missing values inside a group are skipped when computing the median.

    Raises:
        ValueError: If ``group_by_cols`` or ``aggregate_cols`` is empty.
        RuntimeError: If grouping or aggregation fails. The underlying
            exception is attached as ``__cause__``.
    """
    print(f"Aggregating {len(aggregate_cols)} survey columns by {len(group_by_cols)} demographic variables using median...")
    if not group_by_cols:
        raise ValueError("Error: No columns specified for grouping.")
    if not aggregate_cols:
        raise ValueError("Error: No columns specified for aggregation.")

    # Warn about columns whose dtype has no meaningful median. Alternatives
    # tried earlier: raising TypeError, coercing with pd.to_numeric(errors='coerce'),
    # or excluding the columns from aggregate_cols.
    non_numeric_cols = df[aggregate_cols].select_dtypes(exclude=np.number).columns.tolist()
    if non_numeric_cols:
        print(f"Warning: The following columns intended for median aggregation are non-numeric and have no meaningful median: {non_numeric_cols}")

    try:
        grouped = df.groupby(group_by_cols, observed=False, dropna=True)
        aggregated_data = grouped[aggregate_cols].median().reset_index()
    except Exception as e:
        print(f"Error during median aggregation: {e}")
        raise RuntimeError(f"Median aggregation failed: {e}") from e

    print(f"Aggregation complete. Reduced dataset from {len(df):,} rows to {len(aggregated_data):,} rows (unique demographic profiles).")
    return aggregated_data

def save_aggregated_data(df: pd.DataFrame, filepath: pathlib.Path):
    """Persist the aggregated DataFrame as a CSV file (index column omitted).

    The destination folder is created when absent. Errors are reported on
    stdout and not re-raised, so a failed save does not stop the caller.
    """
    print(f"\nSaving aggregated data to: {filepath}")
    try:
        # Create the output folder (and any parents) before writing.
        filepath.parent.mkdir(exist_ok=True, parents=True)
        df.to_csv(filepath, index=False)
        shape = df.shape
        summary = f"({shape[0]:,} rows × {shape[1]} cols)"
        print(f"Successfully saved {filepath} {summary}")
    except Exception as e:
        print(f"Error saving aggregated data to {filepath}: {e}")

In [None]:
# =============================================================================
# Main Execution Workflow
# =============================================================================

def main():
    """Run the median-aggregation pipeline: load, split columns, aggregate, preview, save.

    Nothing is raised to the caller; every failure is reported on stdout with a
    headline that depends on the exception type.
    """
    print(f"--- Starting WVS Data Aggregation Pipeline (Method: {AGGREGATION_METHOD}) ---")

    try:
        # 1. Load Imputed Data
        df_imputed = load_imputed_data(IMPUTED_DATA_PATH)

        # Hand the callee a copy so the module-level configuration list can
        # never be altered by it.
        current_demographic_vars = AGGREGATION_DEMOGRAPHIC_VARS.copy()

        # 2. Identify Columns for Grouping and Aggregation
        demographic_cols, survey_cols = identify_columns(df_imputed, current_demographic_vars)

        # 3. Aggregate Data -- only 'median' is handled in this section.
        if AGGREGATION_METHOD == 'median':
            df_aggregated = aggregate_data_by_median(df_imputed, demographic_cols, survey_cols)
        else:
            raise ValueError(f"Unsupported AGGREGATION_METHOD: {AGGREGATION_METHOD}. Choose 'median'.")

        # 4. Display Head of Aggregated Data (Optional)
        print("\nFirst 5 rows of aggregated data:")
        print(df_aggregated.head())

        # 5. Save Aggregated Data
        save_aggregated_data(df_aggregated, AGGREGATED_OUTPUT_CSV_PATH)

        print(f"\n--- WVS Data Aggregation Pipeline (Method: {AGGREGATION_METHOD}) Finished ---")

    except FileNotFoundError as e:
        print("\nCritical Error: Input file not found.")
        print(e)
    except ValueError as e:
        print("\nCritical Error: Configuration or Data invalid.")
        print(e)
    except NotImplementedError as e:
        # Must come before RuntimeError: NotImplementedError is a subclass of
        # RuntimeError, so placed after it this handler could never run.
        print("\nCritical Error: Functionality not implemented.")
        print(e)
    except RuntimeError as e:
        print("\nCritical Error: Aggregation process failed.")
        print(e)
    except Exception as e:
        print("\nAn unexpected error occurred during the pipeline execution:")
        print(e)
        # For a detailed traceback while debugging:
        # import traceback
        # traceback.print_exc()

In [62]:
# %%
# Entry point: run the median pipeline only when this code executes as the
# main module (as it does in a notebook cell or when run as a script).
if __name__ == "__main__":
    main()

--- Starting WVS Data Aggregation Pipeline (Method: median) ---
Loading imputed data from: data\wvs_wave7_imputed.csv
Loaded imputed data shape: (97220, 102)
Identifying demographic and survey columns...
Found 5 demographic variables for grouping.
Found 97 survey question columns for aggregation.
Aggregating 97 survey columns by 5 demographic variables using median...
Aggregation complete. Reduced dataset from 97,220 rows to 2,250 rows (unique demographic profiles).

First 5 rows of aggregated data:
  B_COUNTRY_ALPHA  H_URBRURAL  Q260  X003R2  Q275R  Q106  Q107  Q108  Q109  \
0             AND         1.0   1.0     1.0    1.0   8.0   5.0   7.0   1.5   
1             AND         1.0   1.0     1.0    2.0   7.0   5.0   3.0   2.0   
2             AND         1.0   1.0     1.0    3.0   6.0   5.0   4.0   2.0   
3             AND         1.0   1.0     2.0    1.0   5.0   5.0   5.0   2.0   
4             AND         1.0   1.0     2.0    2.0   7.0   5.0   5.0   2.0   

   Q110  ...  Q69P  Q6P  Q

```python
--- Starting WVS Data Aggregation Pipeline (Method: median) ---
Loading imputed data from: data\wvs_wave7_imputed.csv
Loaded imputed data shape: (97220, 102)
Identifying demographic and survey columns...
Found 5 demographic variables for grouping.
Found 97 survey question columns for aggregation.
Aggregating 97 survey columns by 5 demographic variables using median...
Aggregation complete. Reduced dataset from 97,220 rows to 2,250 rows (unique demographic profiles).

First 5 rows of aggregated data:
  B_COUNTRY_ALPHA  H_URBRURAL  Q260  X003R2  Q275R  Q106  Q107  Q108  Q109  \
0             AND         1.0   1.0     1.0    1.0   8.0   5.0   7.0   1.5   
1             AND         1.0   1.0     1.0    2.0   7.0   5.0   3.0   2.0   
2             AND         1.0   1.0     1.0    3.0   6.0   5.0   4.0   2.0   
3             AND         1.0   1.0     2.0    1.0   5.0   5.0   5.0   2.0   
4             AND         1.0   1.0     2.0    2.0   7.0   5.0   5.0   2.0   

   Q110  ...  Q69P  Q6P  Q70P  Q71P  Q72P  Q73P  Q77P  Q8P  Q90  Q9P  
0   6.0  ...   3.0  1.5   2.0   1.5   1.0   2.0   2.0  0.0  5.0  0.5  
1   4.0  ...   3.0  1.0   3.0   2.0   2.0   2.0   3.0  0.0  5.0  0.0  
2   5.0  ...   3.0  1.0   3.0   3.0   2.0   2.0   2.0  0.0  4.0  0.0  
3   4.0  ...   3.0  2.0   3.0   2.0   2.0   2.0   3.0  0.0  4.0  1.0  
4   4.0  ...   3.0  2.0   3.0   2.0   2.0   2.0   2.0  0.0  4.0  0.0  

[5 rows x 102 columns]

Saving aggregated data to: data\new_median_wvs_wave7_aggregated_by_demographics.csv
Successfully saved data\new_median_wvs_wave7_aggregated_by_demographics.csv (2,250 rows × 102 cols)

--- WVS Data Aggregation Pipeline (Method: median) Finished ---
```