In [0]:
# Import libraries
import sys
import os

sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import yaml

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_task_amount_features,
    create_task_temporal_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score,
    add_rating_delta
)
from eda.statistical_tests import (
    perform_chi_square_tests,
    perform_mannwhitney_tests,
    perform_welch_ttests,
    perform_two_proportion_z_tests,
    perform_welch_ttests_on_proportions,
)

from eda.visualizations import (
    create_breakdown_summary,
    create_breakdown_chart,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_dual_axis_statistical_chart,
    create_feature_breakdown_table,
    create_distribution_comparison,
    calculate_temporal_feature_deltas,       
    create_chi_squared_delta_dual_axis_chart,
)

# Load configs
def _load_yaml_config(path):
    """Read one YAML config file and return its parsed contents.

    The file is opened as UTF-8 explicitly so that parsing does not depend on
    the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as config_file:
        return yaml.safe_load(config_file)


# Relative paths: resolved against the notebook's current working directory.
feature_config = _load_yaml_config('../configs/feature_engineering.yaml')
paths_config = _load_yaml_config('../configs/data_paths.yaml')
stats_config = _load_yaml_config('../configs/statistical_tests.yaml')

# Do not truncate columns when displaying wide DataFrames
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


In [0]:
# The notebook's working directory is assumed to sit one level below the repo root
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")

_output_files = paths_config['output_files']


def _misc_file(config_key):
    """Return the path inside misc_dir that has the configured output's file name."""
    file_name = os.path.basename(_output_files[config_key])
    return os.path.join(misc_dir, file_name)


# Only the file names come from the config; the directory is always misc_dir
output_path = _misc_file('user_info_df_post_eda')
wonky_counts_path = _misc_file('wonky_user_counts')
wonky_respondent_df_path = _misc_file('wonky_respondent_df')
wonky_respondent_summary_path = _misc_file('wonky_respondent_summary')

# total user info
user_info_df = pd.read_parquet(output_path)
# normal tasks and wonky tasks for wonky task respondents
wonky_counts = pd.read_parquet(wonky_counts_path)
# task level info for wonky task respondents
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
# summary of wonky task respondents
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)

In [0]:
def aggregate_user_level_proportions(
    df: pd.DataFrame,
    feature_set: list,
    group_var: str = "wonky_study_count",
    user_id_var: str = "respondentPk",
) -> pd.DataFrame:
    """
    Aggregate user x task data to user-level proportions.

    For each user, calculates the proportion of tasks where each binary
    feature = 1 (the per-user mean of the feature), so that every user
    contributes exactly one row to downstream comparisons.

    Parameters:
    -----------
    df : pd.DataFrame
        User x task level data. It is not modified.
    feature_set : List[str]
        Binary features to aggregate. Names that are not columns of ``df``
        are skipped with a warning; repeated names are used once.
    group_var : str
        Group indicator (wonky_study_count). Missing values are treated as 0.
    user_id_var : str
        User identifier column

    Returns:
    --------
    pd.DataFrame
        One row per user with columns, in order: ``user_id_var``,
        ``group_binary``, one column per aggregated feature, ``n_tasks``.
        ``group_binary`` is an integer 0/1 flag that is 1 when any of the
        user's rows has ``group_var`` > 0. ``n_tasks`` is the number of rows
        the user has in ``df``.

    Raises:
    -------
    KeyError
        If ``user_id_var`` or ``group_var`` is not a column of ``df``.
    """
    import warnings  # local import so the function is usable on its own

    missing_required = [col for col in (user_id_var, group_var) if col not in df.columns]
    if missing_required:
        raise KeyError(f"Required column(s) not found in df: {missing_required}")

    # De-duplicate while preserving order, then split into present / absent
    requested = list(dict.fromkeys(feature_set))
    features = [col for col in requested if col in df.columns]
    skipped = [col for col in requested if col not in df.columns]
    if skipped:
        warnings.warn(
            f"Features not found in df and skipped: {skipped}",
            stacklevel=2,
        )

    # Copy only the columns that are needed rather than the whole frame
    needed = list(dict.fromkeys([user_id_var, group_var, *features]))
    per_task = df[needed].copy()
    per_task[group_var] = per_task[group_var].fillna(0)
    per_task["group_binary"] = (per_task[group_var] > 0).astype(int)

    grouped = per_task.groupby(user_id_var)

    # 'max' keeps the group flag strictly 0/1 even when group_var differs
    # between a user's rows; a mean would yield a fractional "binary" flag.
    user_df = grouped.agg({"group_binary": "max", **{col: "mean" for col in features}})

    # Row count per user, aligned on the user-id index
    user_df["n_tasks"] = grouped.size()

    return user_df.reset_index()


In [0]:
# Feature names for the temporal feature set, as listed in the statistical-tests config
feature_set = stats_config['feature_sets']['temporal_features']

In [0]:
# Collapse the user x task rows to one row per respondent, using the default
# group and user-id columns of aggregate_user_level_proportions
user_df = aggregate_user_level_proportions(user_info_df, feature_set)

In [0]:
# Display the user-level table returned by aggregate_user_level_proportions
user_df

In [0]:
# Test the fixed function
print("Testing aggregation function...")

user_df = aggregate_user_level_proportions(
    df=user_info_df,
    feature_set=feature_set,
    group_var='wonky_study_count',
    user_id_var='respondentPk'
)

# Check the aggregation invariants explicitly instead of relying on reading the prints
assert user_df['respondentPk'].is_unique, "expected exactly one row per respondent"
assert user_df['n_tasks'].sum() == user_info_df['respondentPk'].notna().sum(), \
    "per-user task counts should add up to the rows that have a respondent id"

print(f"\nOriginal data: {len(user_info_df)} rows (user x task)")
print(f"Aggregated data: {len(user_df)} rows (unique users)")
print("\nFirst few users:")
print(user_df.head())

print("\nTask count distribution:")
print(user_df['n_tasks'].describe())

print("\nGroup distribution:")
print(user_df['group_binary'].value_counts())
