Exec Summary

-
-
-
-
-

In [0]:
%pip install pyyaml>=6.0 -q

In [0]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

try:
    import yaml
except ImportError as exc:
    # Chain the original exception so the underlying import failure stays visible.
    # The requirement is quoted in the hint because an unquoted ">=" is treated
    # as an output redirection when the command goes through a shell.
    raise ImportError(
        "PyYAML is not installed. Please run the previous cell to install it, "
        'or run: %pip install "pyyaml>=6.0"'
    ) from exc

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_time_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score
)
from eda.statistical_tests import (
    compare_groups_statistically,
    compare_groups_with_both_tests,
    analyze_thresholds,
    perform_chi_square_tests,
    perform_mannwhitney_tests,
    perform_welch_ttests,
    perform_two_proportion_z_tests,
    compare_demographic_groups
)

from eda.visualizations import (
    create_histogram,
    create_box_plot,
    create_scatter_plot,
    create_bar_plot,
    create_temporal_breakdown_summary,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_dual_axis_statistical_chart,
    create_feature_breakdown_table,
    create_distribution_comparison,
    calculate_temporal_feature_deltas,       
    create_chi_squared_delta_dual_axis_chart,
)

# Load configs
# The encoding is given explicitly so the YAML files are decoded the same way on
# every platform; open() otherwise falls back to the locale's preferred encoding.
with open('../configs/feature_engineering.yaml', 'r', encoding='utf-8') as f:
    feature_config = yaml.safe_load(f)

with open('../configs/statistical_tests.yaml', 'r', encoding='utf-8') as f:
    stats_config = yaml.safe_load(f)

with open('../configs/data_paths.yaml', 'r', encoding='utf-8') as f:
    paths_config = yaml.safe_load(f)

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### File Definitions

- **user_info_df**: DataFrame of respondent x task level data for all users (not just wonky studies)
- **wonky_studies_df**: DataFrame of respondents involved in studies with unexpected outcomes (negative impacts when positive expected)

A study is "wonky" if the outcome is unexpected (e.g., advertisement showed negative impacts of media, which is counter-intuitive).


### Load

In [0]:
# Resolve <repo root>/misc relative to the notebook's working directory.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")

output_files = paths_config['output_files']


def _misc_path(config_key):
    """Return misc_dir joined with the base name of the configured output file."""
    return os.path.join(misc_dir, os.path.basename(output_files[config_key]))


output_path = _misc_path('user_info_df')
wonky_counts_path = _misc_path('wonky_user_counts')
wonky_respondent_df_path = _misc_path('wonky_respondent_df')
wonky_respondent_summary_path = _misc_path('wonky_respondent_summary')

# total user info
user_info_df = pd.read_parquet(output_path)
# normal tasks and wonky tasks for wonky task respondents
wonky_counts = pd.read_parquet(wonky_counts_path)
# task level info for wonky task respondents
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
# summary of wonky task respondents
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)

In [0]:
# Attach wonky_study_count to each task row. The left join keeps every
# user_info_df row; the match is on respondent plus task key (taskPk <-> task_pk).
wonky_task_keys = wonky_respondent_df[['balance_respondentPk', 'task_pk', 'wonky_study_count']]
user_info_df = user_info_df.merge(
    wonky_task_keys,
    how='left',
    left_on=['balance_respondentPk', 'taskPk'],
    right_on=['balance_respondentPk', 'task_pk'],
)

In [0]:
# wonky_respondent_summary comes from pd.read_parquet, i.e. it is a pandas
# DataFrame, which has no .display() method; use the notebook's display() helper
# as the other cells do.
display(wonky_respondent_summary)

In [0]:
# Render the task-level frame for wonky task respondents.
wonky_respondent_df

In [0]:
# Preview both frames.
print(user_info_df.head())

print(wonky_respondent_df.head())

# Null count per column of user_info_df, shown as a table.
df = pd.DataFrame(user_info_df.isnull().sum(), columns=['null_count'])
display(df.reset_index())

# Only columns of wonky_respondent_df that actually contain missing values.
# The header names the frame that is inspected here.
print("\nwonky_respondent_df - Missing values:")
missing_wonky = wonky_respondent_df.isnull().sum()
print(missing_wonky[missing_wonky > 0])

In [0]:
# Render wonky_counts (normal and wonky task counts for wonky task respondents).
wonky_counts

In [0]:
# Summary statistics for whichever key numeric columns are present.
key_numeric_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality', 'task_completed']
available_cols = [col for col in key_numeric_cols if col in user_info_df.columns]
print(user_info_df[available_cols].describe())

banner = "=" * 80

# The comparison only runs when user_info_df carries a wonky_study_flag column.
# NOTE(review): the merge cell above adds `wonky_study_count`, not
# `wonky_study_flag` / `wonky_studies_count` — confirm these names are intended.
if 'wonky_study_flag' in user_info_df.columns:
    print("\n" + banner)
    print("COMPARISON BY wonky_study_flag (Task Level)")
    print(banner)
    comparison_cols = [
        col
        for col in ['task_time_taken_s', 'task_points', 'risk', 'quality']
        if col in user_info_df.columns
    ]

    if comparison_cols:
        wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 1]
        non_wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 0]

        for heading, task_subset in (
            ("\nWonky Study Tasks (wonky_study_flag=1):", wonky_study_tasks),
            ("\nNon-Wonky Study Tasks (wonky_study_flag=0):", non_wonky_study_tasks),
        ):
            print(heading)
            print(task_subset[comparison_cols].describe())

        if 'wonky_studies_count' in user_info_df.columns:
            wonky_user_tasks = user_info_df[user_info_df['wonky_studies_count'] > 0]
            print("\nTasks from Users with Wonky Studies (wonky_studies_count > 0):")
            print(wonky_user_tasks[comparison_cols].describe())

# Summary statistics of the wonky_counts frame.
print("\n" + banner)
print("STATISTICAL SUMMARY: wonky_studies_df")
print(banner)
print(wonky_counts.describe())


### Feature Engineering

In [0]:
# Accumulator for feature/column names; the sections below append to it.
main_features = []

### Behavioural Stuff

#### Days active before task

In [0]:
# Earliest date_completed per respondent, renamed to first_task_completed_date.
min_dates = (
    user_info_df[['respondentPk', 'date_completed']]
    .groupby('respondentPk')
    .min()
    .reset_index()
    .rename(columns={'date_completed': 'first_task_completed_date'})
)

In [0]:
# Attach each respondent's first completion date, then count whole days between
# that date and the completion date of the current task.
user_info_df = user_info_df.merge(min_dates, how="left", on="respondentPk")
time_since_first_task = user_info_df["date_completed"] - user_info_df["first_task_completed_date"]
user_info_df["days_active_before_task"] = time_since_first_task.dt.days

In [0]:
# Frequency of each days_active_before_task value across all task rows.
user_info_df["days_active_before_task"].value_counts()

In [0]:
# Same frequency table, restricted to rows with wonky_study_count > 0.
user_info_df.loc[user_info_df['wonky_study_count'] > 0, 'days_active_before_task'].value_counts()

In [0]:
# Narrow working frame: respondent id, days active, and the wonky study count.
user_info_df_shortened = user_info_df[['respondentPk', 'days_active_before_task', 'wonky_study_count']]

In [0]:
# One-hot encode days_active_before_task (values are stringified first).
# groupby(level=0).sum() collapses the indicators back to one row per index label.
series = user_info_df_shortened['days_active_before_task']
day_labels = series.explode().astype(str).str.strip()
daysactive_dummies = (
    pd.get_dummies(day_labels)
    .groupby(level=0)
    .sum()
    .add_prefix('days_active_')
)
daysactive_cols = daysactive_dummies.columns
user_info_df_shortened = user_info_df_shortened.join(daysactive_dummies)

In [0]:
# Render the shortened frame with the days_active_* indicator columns joined on.
user_info_df_shortened

##### Test

In [0]:
# Run perform_chi_square_tests over the days_active_* columns, grouped by
# wonky_study_count, at significance level 0.01; the last line displays the result.
chi_square_results = perform_chi_square_tests(
    user_info_df_shortened,
    feature_set=daysactive_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
# Run perform_mannwhitney_tests over the days_active_* columns, grouped by wonky_study_count.
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
# Run perform_two_proportion_z_tests over the days_active_* columns, grouped by wonky_study_count.
ztest_results = perform_two_proportion_z_tests(
    user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
# Names of the generated days_active_* indicator columns.
daysactive_cols

In [0]:
# Register the days-active indicators plus the raw day count as main features.
# NOTE(review): the days_active_* columns were joined onto user_info_df_shortened
# only, not onto user_info_df, yet main_features is later passed as
# categorical_cols together with user_info_df — confirm the helper tolerates
# names that are missing from that frame.
main_features += list(daysactive_cols)
main_features += ['days_active_before_task']
main_features

#### Temporal Feature Analysis & Breakdowns - STRONG HYPOTHESIS

Analyzing temporal patterns to identify differences between wonky and non-wonky study tasks.


In [0]:
# Distinct values of wonky_task_instances.
# NOTE(review): no cell visible above adds this column (the merge brings in only
# wonky_study_count); presumably it is already in the parquet — confirm.
user_info_df['wonky_task_instances'].unique()

In [0]:
# Derive the time-based indicator columns from date_completed via the shared helper.
user_info_df = create_time_features(user_info_df, date_col="date_completed")

# Share of task rows flagged as night / weekend, as percentages.
for share_label, flag_col in (("Night tasks", 'is_night'), ("Weekend tasks", 'is_weekend')):
    flag_pct = user_info_df[flag_col].mean() * 100
    print(f"{share_label}: {flag_pct:.1f}%")

In [0]:
# Indicator columns examined in the temporal analysis: coarse time buckets
# first, then one flag per day of the week.
temporal_features = [
    "is_weekday", "is_weekend", "is_night",
    "is_business_hour", "is_business_hour_weekday", "is_business_hour_weekend",
    "is_monday", "is_tuesday", "is_wednesday", "is_thursday",
    "is_friday", "is_saturday", "is_sunday",
]

# Breakdown of the temporal flags, split on wonky_study_count with threshold 0.
temporal_breakdown = create_temporal_breakdown_summary(
    user_info_df,
    temporal_features=temporal_features,
    group_col="wonky_study_count",
    group_threshold=0,
)
print(temporal_breakdown)

**Task completion time is definitely a good gauge. The majority of tasks take place during business hours, relatively evenly spread across the work week. The LARGEST delta, where wonky is more prevalent, is in business hours, suggesting professional behaviours.**

##### Tests for Temporal Features

Testing independence between temporal features and wonky study participation.
Chi-squared test determines if temporal patterns differ significantly between wonky and non-wonky groups.


In [0]:
# Run perform_chi_square_tests over the temporal flags, grouped by
# wonky_study_count, at significance level 0.01; the last line displays the result.
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=temporal_features,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
# Run perform_mannwhitney_tests over the temporal flags, grouped by wonky_study_count.
mannwhitney_results = perform_mannwhitney_tests(user_info_df, temporal_features, group_var='wonky_study_count')
mannwhitney_results

In [0]:
# Re-display of the Mann-Whitney results computed in the previous cell (duplicate output).
mannwhitney_results

is_business_hour CLES: a task picked at random from this group has roughly a 55% chance of producing a wonky study. ROUGH INTERPRETATION

In [0]:
# Run perform_welch_ttests over the temporal flags, grouped by wonky_study_count.
welch_results = perform_welch_ttests(user_info_df, temporal_features, group_var='wonky_study_count')
welch_results

In [0]:
# Two-proportion z-tests over the temporal flags. The grouping column matches the
# chi-square, Mann-Whitney and Welch cells of this section (wonky_study_count),
# so the four result tables compare the same two groups.
ztest_results = perform_two_proportion_z_tests(user_info_df, temporal_features, group_var='wonky_study_count')
ztest_results

**Strong significance for all subsets except Wednesday and Friday, although this doesn't take directionality into consideration -> the simplest separators are business hours**

In [0]:
# Register every temporal flag as a main feature.
main_features += list(temporal_features)

STRONG SIGNIFICANT READ AT 99% LEVEL LARGEST MAGNITUDE FOUND AT NIGHT. LOWEST MAGNITUDE DURING WEEKEND

In [0]:
# # Visualize chi-squared statistics
# if len(chi_square_results) > 0:
#     fig_chi2 = create_chi_squared_bar_chart(
#         chi_square_results,
#         chi2_col='chi2',
#         p_value_col='chi_p_value',
#         significance_level=0.01,
#         title="Chi-Squared Statistic by Temporal Feature"
#     )
#     fig_chi2.show()
# else:
#     print("No chi-squared test results available for visualization")


In [0]:
# if len(chi_square_results) > 0:
#     delta_results = calculate_temporal_feature_deltas(
#       user_info_df,
#       temporal_features=temporal_features,
#       group_col='wonky_task_instances',  
#       group_threshold=0 
# )
    
#     if len(delta_results) > 0:
#         fig_dual = create_chi_squared_delta_dual_axis_chart(
#             chi_square_results,
#             delta_results,
#             chi2_col='chi2',
#             p_value_col='chi_p_value',
#             delta_col='delta_pct',
#             significance_level=0.01,
#             title="Chi-Squared Statistic and Delta % by Temporal Feature"
#         )
#         fig_dual.show()
#     else:
#         print("No delta results available for visualization")
# else:
#     print("No chi-squared test results available for visualization")

**The bars show the level of significance between the wonky and non-wonky groups; the line shows the deltas between wonky and non-wonky in terms of when tasks are completed.**

**A positive delta means wonky participants are more prevalent and a negative delta means they are less prevalent.**

**Business hours, night time and Saturdays look like the overall best separators between wonky and non-wonky participants in terms of task completion time.**

#### Task speed features - OKAY HYPOTHESIS

In [0]:
# Cap task times at the 99.99th percentile because a few very anomalous
# durations throw off the average.
# clip() leaves missing values as NaN, whereas np.where(x < cap, x, cap) would
# replace them with the cap because `NaN < cap` evaluates to False.
task_time_cap = user_info_df['task_time_taken_s'].quantile(0.9999)
user_info_df['task_time_taken_s_capped'] = user_info_df['task_time_taken_s'].clip(upper=task_time_cap)

In [0]:
# Build the speed features from the capped task time.
user_info_df = create_task_speed_features(
    user_info_df, task_time_col="task_time_taken_s_capped", use_std_dev=True
)

# Mean / standard deviation of the capped times and the 1σ / 2σ bands around the mean.
capped_times = user_info_df["task_time_taken_s_capped"]
mean_time = capped_times.mean()
std_time = capped_times.std()
print("Task time statistics:")
print(f"  Mean: {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev: {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
for threshold_label, threshold_value in [
    ("Fast threshold (mean - 1σ)", mean_time - std_time),
    ("Suspiciously fast threshold (mean - 2σ)", mean_time - 2*std_time),
    ("Slow threshold (mean + 1σ)", mean_time + std_time),
    ("Suspiciously slow threshold (mean + 2σ)", mean_time + 2*std_time),
]:
    print(f"  {threshold_label}: {threshold_value:.2f}s")
print()

# Display breakdown with wonky vs non-wonky comparison
speed_breakdown = create_task_speed_breakdown_summary(
    user_info_df, group_col='wonky_task_instances', group_threshold=0
)
print(speed_breakdown)

**Wonky participants are usually suspiciously fast to normal; non-wonky participants tend to be normal to suspiciously slow in terms of delta.**

In [0]:
# Re-run the speed feature helper, this time on the uncapped task time.
# NOTE(review): this runs after the capped-column call in the previous cell and
# presumably overwrites the speed flags it produced — confirm which set should
# feed the tests below.
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    use_std_dev=True
)

# Percentile thresholds computed here in the notebook for reporting.
# NOTE(review): whether create_task_speed_features uses these percentiles is not
# visible from this cell (it is called with use_std_dev=True) — confirm the
# printed thresholds match the flags actually created.
fast_threshold = user_info_df["task_time_taken_s"].quantile(0.16)
suspiciously_fast_threshold = user_info_df["task_time_taken_s"].quantile(0.025)
slow_threshold = user_info_df["task_time_taken_s"].quantile(0.84)
suspiciously_slow_threshold = user_info_df["task_time_taken_s"].quantile(0.975)

# Also calculate trimmed mean/std for reference (trimming extreme outliers)
# clip() pulls values outside the 1st-99th percentile range to those bounds
# rather than dropping them.
trimmed_data = user_info_df["task_time_taken_s"].clip(
    lower=user_info_df["task_time_taken_s"].quantile(0.01),
    upper=user_info_df["task_time_taken_s"].quantile(0.99)
)
mean_time = trimmed_data.mean()
std_time = trimmed_data.std()

print(f"Task time statistics (using percentiles, robust to outliers):")
print(f"  Mean (trimmed 1%-99%): {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev (trimmed 1%-99%): {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
print(f"  Fast threshold (16th percentile): {fast_threshold:.2f}s ({fast_threshold/60:.2f} min)")
print(f"  Suspiciously fast threshold (2.5th percentile): {suspiciously_fast_threshold:.2f}s ({suspiciously_fast_threshold/60:.2f} min)")
print(f"  Slow threshold (84th percentile): {slow_threshold:.2f}s ({slow_threshold/60:.2f} min)")
print(f"  Suspiciously slow threshold (97.5th percentile): {suspiciously_slow_threshold:.2f}s ({suspiciously_slow_threshold/60:.2f} min)")
print()

# Display breakdown with wonky vs non-wonky comparison
# Falls back to wonky_study_count when wonky_task_instances is not a column.
group_col_to_use = 'wonky_task_instances' if 'wonky_task_instances' in user_info_df.columns else 'wonky_study_count'
print(create_task_speed_breakdown_summary(
    user_info_df,
    group_col=group_col_to_use,
    group_threshold=0
))

##### Tests

In [0]:
# Speed indicator columns fed to the tests below.
speed_features = ['is_suspiciously_fast', 'is_fast', 'is_normal_speed', 'is_slow', 'is_suspiciously_slow']

In [0]:
# Run perform_chi_square_tests over the speed flags, grouped by wonky_study_count,
# at significance level 0.01; the last line displays the result.
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=speed_features,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
# Run perform_mannwhitney_tests over the speed flags, grouped by wonky_study_count.
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, speed_features, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
# Run perform_welch_ttests over the speed flags, grouped by wonky_study_count.
welch_results = perform_welch_ttests(
    user_info_df, speed_features, group_var="wonky_study_count"
)
welch_results

In [0]:
# Run perform_two_proportion_z_tests over the speed flags, grouped by wonky_study_count.
ztest_results = perform_two_proportion_z_tests(
    user_info_df, speed_features, group_var="wonky_study_count"
)
ztest_results

In [0]:
# Register the speed flags as main features and show the running list.
main_features += speed_features
main_features

Suspiciously slow is unanimously a signal of a non-wonky participant

TODO -> Come up with good Viz across test

#### Task and Point category - WEAK HYPOTHESIS

In [0]:
# user_info_df['defined_task_category'] = user_info_df['taskCategory'].astype(str) + "_points_" + user_info_df['payoutPoints'].astype(str)

# # Import the function
# from eda.statistical_tests import compare_task_category_wonky_rates

# # Create the summary table directly from user_info_df
# # Why: Analyzes wonky rates by task category to identify which categories have higher fraud rates
# category_wonky_summary = compare_task_category_wonky_rates(
#     df=user_info_df,
#     category_col="defined_task_category",
#     respondent_id_col="respondentPk",
#     wonky_col="wonky_study_count"
# )

# # Display the results
# print("Wonky Rates by Task Category:")
# print("=" * 100)
# display(category_wonky_summary)

# # Format for better readability
# category_wonky_summary_formatted = category_wonky_summary.copy()
# category_wonky_summary_formatted['wonky_pct'] = category_wonky_summary_formatted['wonky_pct'].round(2)
# category_wonky_summary_formatted['non_wonky_pct'] = category_wonky_summary_formatted['non_wonky_pct'].round(2)
# category_wonky_summary_formatted['proportion_delta'] = category_wonky_summary_formatted['proportion_delta'].round(2)

# display(category_wonky_summary_formatted)

#### Device

In [0]:
# One-hot encode hardware_version.
# groupby(level=0).sum() collapses the indicators back to one row per index label.
series = user_info_df['hardware_version']
hardware_labels = series.explode().str.strip()
hardware_dummies = pd.get_dummies(hardware_labels).groupby(level=0).sum()
hardware_cols = hardware_dummies.columns
user_info_df = user_info_df.join(hardware_dummies)

In [0]:
# Names of the generated hardware indicator columns.
hardware_cols

In [0]:
# Reuse the temporal breakdown helper for the hardware indicator columns,
# split on wonky_study_count with threshold 0.
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=hardware_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

huge delta for desktop, but this might be a sample thing. may converge better with 3-6 months of data.

###### Tests

In [0]:
# Run perform_chi_square_tests over the hardware columns, grouped by
# wonky_study_count, at significance level 0.01; the last line displays the result.
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=hardware_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
# Run perform_mannwhitney_tests over the hardware columns, grouped by wonky_study_count.
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, hardware_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
# Run perform_welch_ttests over the hardware columns, grouped by wonky_study_count.
welch_results = perform_welch_ttests(
    user_info_df, hardware_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
# Run perform_two_proportion_z_tests over the hardware columns, grouped by wonky_study_count.
ztest_results = perform_two_proportion_z_tests(
    user_info_df, hardware_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
# Register two hand-picked hardware indicators (hard-coded names; they must
# match columns produced by the one-hot encoding above).
main_features += ['Desktop', 'Iphone']
main_features

#### Platform

In [0]:
# One-hot encode platform_name.
# groupby(level=0).sum() collapses the indicators back to one row per index label.
series = user_info_df['platform_name']
platform_labels = series.explode().str.strip()
platform_dummies = pd.get_dummies(platform_labels).groupby(level=0).sum()
platform_cols = platform_dummies.columns
user_info_df = user_info_df.join(platform_dummies)

In [0]:
# Names of the generated platform indicator columns.
platform_cols

In [0]:
# Reuse the temporal breakdown helper for the platform indicator columns,
# split on wonky_study_count with threshold 0.
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=platform_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

##### Tests

In [0]:
# Run perform_chi_square_tests over the platform columns, grouped by
# wonky_study_count, at significance level 0.01; the last line displays the result.
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=platform_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
# Run perform_mannwhitney_tests over the platform columns, grouped by wonky_study_count.
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, platform_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
# Run perform_welch_ttests over the platform columns, grouped by wonky_study_count.
welch_results = perform_welch_ttests(
    user_info_df, platform_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
# Run perform_two_proportion_z_tests over the platform columns, grouped by wonky_study_count.
ztest_results = perform_two_proportion_z_tests(
    user_info_df, platform_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
# Register two hand-picked platform indicators (hard-coded names; they must
# match columns produced by the one-hot encoding above).
main_features += ['Linux', 'iOS']
main_features

### Demographic stuff

#### Gambling

In [0]:
# One-hot encode each gambling mode. explode() gives one row per list element
# when a cell holds a list; groupby(level=0).sum() then collapses the indicators
# back to one row per index label.
series = user_info_df['gambling_participation_mc']
gambling_labels = series.explode().str.strip()
gambling_dummies = pd.get_dummies(gambling_labels).groupby(level=0).sum()
gambling_cols = gambling_dummies.columns
user_info_df = user_info_df.join(gambling_dummies)

In [0]:
# Names of the generated gambling indicator columns.
gambling_cols

In [0]:
# Reuse the temporal breakdown helper for the gambling indicator columns,
# split on wonky_study_count with threshold 0.
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=gambling_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

###### Tests

In [0]:
# Run perform_chi_square_tests over the gambling columns, grouped by
# wonky_study_count, at significance level 0.01; the last line displays the result.
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=gambling_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
# Run perform_mannwhitney_tests over the gambling columns, grouped by wonky_study_count.
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, gambling_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:

# Run perform_welch_ttests over the gambling columns, grouped by wonky_study_count.
welch_results = perform_welch_ttests(
    user_info_df, gambling_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
# Run perform_two_proportion_z_tests over the gambling columns, grouped by wonky_study_count.
ztest_results = perform_two_proportion_z_tests(
    user_info_df, gambling_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
# Register one hand-picked gambling indicator (hard-coded name; it must match a
# column produced by the one-hot encoding above).
main_features += ['do not gamble at all']
main_features

#### Income

In [0]:
# Lookup from the single-letter household income code to its band label.
income_map = {
    "A": "Less than £15,000",
    "B": "£15,000 to £19,999",
    "C": "£20,000 to £24,999",
    "D": "£25,000 to £29,999",
    "E": "£30,000 to £34,999",
    "F": "£35,000 to £39,999",
    "G": "£40,000 to £44,999",
    "H": "£45,000 to £49,999",
    "I": "£50,000 to £59,999",
    "J": "£60,000 to £74,999",
    "K": "£75,000 to £84,999",
    "L": "£85,000 to £99,999",
    "M": "£100,000 to £124,999",
    "N": "£125,000 to £149,999",
    "O": "£150,000 to £174,999",
    "P": "£175,000 to £199,999",
    "Q": "£200,000 and above",
    "R": "Prefer not to answer",
}

# Series.map leaves NaN for any code that is not a key of income_map.
user_info_df["fulcrum_household_income_mapped"] = (
    user_info_df["fulcrum_household_income"].map(income_map)
)

# Frequency of each mapped income band.
user_info_df["fulcrum_household_income_mapped"].value_counts()

In [0]:
# One-hot encode each household income band.
# groupby(level=0).sum() collapses the indicators back to one row per index label.
series = user_info_df['fulcrum_household_income_mapped']
income_labels = series.explode().str.strip()
income_dummies = pd.get_dummies(income_labels).groupby(level=0).sum()
income_cols = income_dummies.columns
user_info_df = user_info_df.join(income_dummies)

In [0]:
# Income band frequencies restricted to rows with wonky_study_count > 0.
user_info_df.loc[user_info_df['wonky_study_count'] > 0, 'fulcrum_household_income_mapped'].value_counts()

most wonky studies are in lower income groups

##### Test

In [0]:
# Run perform_chi_square_tests over the income band columns, grouped by
# wonky_study_count, at significance level 0.01; the last line displays the result.
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=income_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
# Run perform_mannwhitney_tests over the income band columns, grouped by wonky_study_count.
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, income_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
# Run perform_welch_ttests over the income band columns, grouped by wonky_study_count.
welch_results = perform_welch_ttests(
    user_info_df, income_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
# Run perform_two_proportion_z_tests over the income band columns, grouped by wonky_study_count.
ztest_results = perform_two_proportion_z_tests(
    user_info_df, income_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
# Register every income band indicator as a main feature.
main_features += list(income_cols)
main_features

In [0]:
# Number of feature names collected so far.
len(main_features)

#### Income Gender

In [0]:
# Register the raw gender column as a main feature.
main_features += ['gender']
main_features

In [0]:
# Combined "<gender>_<income band>" label. Both parts go through astype(str), so
# missing values end up as the literal text "nan" inside the label.
gender_labels = user_info_df['gender'].astype(str)
income_band_labels = user_info_df['fulcrum_household_income_mapped'].astype(str)
user_info_df['gender_fulcrum_household_income_mapped'] = gender_labels + "_" + income_band_labels

In [0]:
# One-hot encode each gender x income band combination.
# groupby(level=0).sum() collapses the indicators back to one row per index label.
series = user_info_df['gender_fulcrum_household_income_mapped']
gender_income_labels = series.explode().str.strip()
gender_income_dummies = pd.get_dummies(gender_income_labels).groupby(level=0).sum()
gender_income_cols = gender_income_dummies.columns
user_info_df = user_info_df.join(gender_income_dummies)

In [0]:
# Frequency of each gender x income band label across all task rows.
user_info_df["gender_fulcrum_household_income_mapped"].value_counts()

In [0]:
# Gender x income band frequencies restricted to rows with wonky_study_count > 0.
user_info_df.loc[user_info_df['wonky_study_count'] > 0, 'gender_fulcrum_household_income_mapped'].value_counts()

##### Tests

In [0]:
# Run perform_chi_square_tests over the gender x income columns, grouped by
# wonky_study_count, at significance level 0.01; the last line displays the result.
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=gender_income_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
# Run perform_mannwhitney_tests over the gender x income columns, grouped by wonky_study_count.
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, gender_income_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
# Run perform_welch_ttests over the gender x income columns, grouped by wonky_study_count.
welch_results = perform_welch_ttests(
    user_info_df, gender_income_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
# Run perform_two_proportion_z_tests over the gender x income columns, grouped by wonky_study_count.
ztest_results = perform_two_proportion_z_tests(
    user_info_df, gender_income_cols, group_var="wonky_study_count"
)
ztest_results

#### Exposure bands

In [0]:
# Share of wonky_respondent_df rows per exposure band. The denominator is the
# full row count, so rows with a missing band are counted in it.
wonky_respondent_df['exposure_band'].value_counts()/len(wonky_respondent_df)

### Respondent_Features

In [0]:
# Create respondent-level behavioral features
import importlib

# If the feature engineering module is already loaded, reload it and re-bind the
# helper so that edits to the module are picked up without restarting the kernel.
if "eda.feature_engineering" in sys.modules:
    importlib.reload(sys.modules["eda.feature_engineering"])
    from eda.feature_engineering import create_respondent_behavioral_features

# Settings handed to the helper, all taken from the feature engineering YAML config.
volume_thresholds = feature_config["volume_thresholds"]
respondent_feature_config = {
    "high_volume_percentile": volume_thresholds["high_volume_percentile"],
    "extreme_volume_percentile": volume_thresholds["extreme_volume_percentile"],
    "velocity_bins": feature_config["velocity_bins"],
    "velocity_labels": feature_config["velocity_labels"],
}

respondent_features = create_respondent_behavioral_features(
    user_info_df,
    respondent_id_col="respondentPk",
    date_col="date_completed",
    config=respondent_feature_config,
    categorical_cols=main_features,
)

n_respondents = respondent_features.shape[0]
print(f"Aggregated to {n_respondents:,} respondents")
avg_tasks = respondent_features['total_tasks'].mean()
print(f"Avg tasks per respondent: {avg_tasks:.2f}")

In [0]:
# Whole days between each respondent's first and last task date.
respondent_features['days_active_all_tasks'] = (respondent_features['last_task_date'] - respondent_features['first_task_date']).dt.days

In [0]:
# Add the wonky features to the respondent-level frame via the shared helper.
respondent_features = add_wonky_features(
    respondent_features, wonky_counts, respondent_id_col="respondentPk"
)

print("Wonky features added")
# Respondents whose wonky_task_ratio is above zero.
respondents_with_wonky_tasks = (respondent_features['wonky_task_ratio'] > 0).sum()
print(f"Respondents with wonky tasks: {respondents_with_wonky_tasks:,}")
# Sum of the is_high_wonky flag.
high_wonky_respondents = respondent_features['is_high_wonky'].sum()
print(f"High wonky concentration (>50%): {high_wonky_respondents:,}")

In [0]:
# Render the respondent-level frame ordered by respondentPk (not assigned back).
respondent_features.sort_values('respondentPk')

In [0]:
# Halts "Run All" at this point, as the original bare `break` did. A `break`
# outside a loop is a SyntaxError, so it only stopped the run by failing to
# compile; raising explicitly keeps the stop and states the intent.
raise RuntimeError("Intentional stop: delete or skip this cell to run the cells below.")

In [0]:
# Spot check: all task rows for one hard-coded respondent id.
# NOTE(review): looks like leftover debugging; the output exposes an individual
# respondent's rows — consider removing before sharing.
user_info_df[user_info_df['respondentPk'] == 'e043970a-f4a2-4a33-afdd-eccb63ec8271']

In [0]:
# Render the respondent-level feature frame.
respondent_features

In [0]:
# Cast every gambling indicator column of the respondent-level frame to int.
gambling_col_names = list(gambling_cols)
respondent_features[gambling_col_names] = respondent_features[gambling_col_names].astype(int)

In [0]:
# A pandas DataFrame has no .display() method; use the notebook's display()
# helper, as the other cells do.
display(respondent_features)

In [0]:
# Work on a copy so the analysis below does not modify respondent_features.
df = respondent_features.copy()

In [0]:
# Visualize target variable distributions: a 2x2 grid with the full
# distributions on the top row and the non-zero subsets on the bottom row.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('wonky_study_count Distribution', 'wonky_task_instances Distribution',
                    'wonky_study_count (Non-zero)', 'wonky_task_instances (Non-zero)'),
    specs=[[{"secondary_y": False} for _ in range(2)] for _ in range(2)]
)

# Non-zero subsets of each target.
df_nonzero_study = df.loc[df['wonky_study_count'] > 0, 'wonky_study_count']
df_nonzero_task = df.loc[df['wonky_task_instances'] > 0, 'wonky_task_instances']

# (values, trace name, grid row, grid col, skip the panel when there is no data)
histogram_panels = [
    (df['wonky_study_count'], 'wonky_study_count', 1, 1, False),
    (df['wonky_task_instances'], 'wonky_task_instances', 1, 2, False),
    (df_nonzero_study, 'wonky_study_count (non-zero)', 2, 1, True),
    (df_nonzero_task, 'wonky_task_instances (non-zero)', 2, 2, True),
]
for panel_values, trace_name, panel_row, panel_col, skip_if_empty in histogram_panels:
    if skip_if_empty and len(panel_values) == 0:
        continue
    fig.add_trace(
        go.Histogram(x=panel_values, nbinsx=20, name=trace_name),
        row=panel_row, col=panel_col
    )

fig.update_layout(
    height=800,
    title_text="Target Variable Distributions",
    showlegend=False
)

# Left column shows study counts, right column shows task instances.
for panel_row in (1, 2):
    fig.update_xaxes(title_text="Count", row=panel_row, col=1)
    fig.update_xaxes(title_text="Instances", row=panel_row, col=2)

fig.show()

In [0]:
# Identify numeric columns (exclude IDs, dates, and target variables)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
exclude_cols = ['respondentPk', 'has_wonky_study', 'has_wonky_task', 
                'wonky_study_count', 'wonky_task_instances']
numeric_cols = [col for col in numeric_cols if col not in exclude_cols]

# Calculate correlations with target variables
# Each target is appended on its own so it appears exactly once in the frame.
# The target's self-correlation stays in the result so that the `[1:]` slices
# used when displaying and plotting keep dropping exactly that entry.
# dropna(): a feature with zero variance has a NaN correlation, and sort_values
# places NaN last, so without it .tail() would return NaN rows instead of the
# most negative correlations.
corr_study_count = (
    df[numeric_cols + ['wonky_study_count']]
    .corr()['wonky_study_count']
    .dropna()
    .sort_values(ascending=False)
)
corr_task_instances = (
    df[numeric_cols + ['wonky_task_instances']]
    .corr()['wonky_task_instances']
    .dropna()
    .sort_values(ascending=False)
)

print("=" * 80)
print("TOP CORRELATIONS WITH wonky_study_count")
print("=" * 80)
print("\nTop 20 positive correlations:")
display(corr_study_count.head(21)[1:])  # Exclude self-correlation
print("\nTop 20 negative correlations:")
display(corr_study_count.tail(20))

In [0]:
# Report the strongest correlations with wonky_task_instances. The header names
# the target column rather than the corr_task_instances variable.
print("=" * 80)
print("TOP CORRELATIONS WITH wonky_task_instances")
print("=" * 80)
print("\nTop 20 positive correlations:")
display(corr_task_instances.head(21)[1:])  # Exclude self-correlation
print("\nTop 20 negative correlations:")
display(corr_task_instances.tail(20))

In [0]:
# Visualize top correlations
top_n = 15

# Head of each sorted series minus its first entry (the target itself), and the
# tail of each series.
top_pos_study = corr_study_count.iloc[1:top_n + 1]
top_neg_study = corr_study_count.tail(top_n)
top_pos_task = corr_task_instances.iloc[1:top_n + 1]
top_neg_task = corr_task_instances.tail(top_n)

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Top Positive Correlations: wonky_study_count',
                    'Top Negative Correlations: wonky_study_count',
                    'Top Positive Correlations: wonky_task_instances',
                    'Top Negative Correlations: wonky_task_instances'),
    specs=[[{"secondary_y": False} for _ in range(2)] for _ in range(2)]
)

# (series, bar colour, trace name, grid row, grid col); empty series add no trace.
correlation_panels = [
    (top_pos_study, 'green', 'Positive', 1, 1),
    (top_neg_study, 'red', 'Negative', 1, 2),
    (top_pos_task, 'green', 'Positive', 2, 1),
    (top_neg_task, 'red', 'Negative', 2, 2),
]
for corr_values, bar_color, trace_name, panel_row, panel_col in correlation_panels:
    if len(corr_values) == 0:
        continue
    fig.add_trace(
        go.Bar(x=corr_values.values, y=corr_values.index, orientation='h',
               marker_color=bar_color, name=trace_name),
        row=panel_row, col=panel_col
    )

fig.update_layout(
    height=1000,
    title_text="Top Correlations with Target Variables",
    showlegend=False
)

# Every panel shares the same x-axis title.
for panel_row in (1, 2):
    for panel_col in (1, 2):
        fig.update_xaxes(title_text="Correlation", row=panel_row, col=panel_col)

fig.show()

#### Stat tests

In [0]:
# Select features for statistical testing: every numeric column except the
# targets and the columns named after them.
target_related_cols = {
    'wonky_study_count', 'wonky_task_instances',
    'wonky_unique_tasks', 'total_wonky_studies',
    'wonky_task_ratio', 'wonky_concentration',
    'wonky_diversity', 'is_high_wonky', 'is_quite_wonky',
    'wonky_study_flag',
}
test_features = [col for col in numeric_cols if col not in target_related_cols]

n_test_features = len(test_features)
print(f"Testing {n_test_features} features")
sample_features = test_features[:10]
print(f"\nSample features: {sample_features}")

In [0]:
# Statistical tests for wonky_study_count (binary: has_wonky_study)
banner = "=" * 80
print(banner, "STATISTICAL TESTS: Features vs has_wonky_study", banner, sep="\n")
print("\nComparing respondents with wonky studies (1) vs without (0)")

# Compare group 1 against group 0 for every test feature; the result is sorted
# by the mw_p_value column.
stats_results_study = compare_groups_statistically(
    df=df,
    group_col='has_wonky_study',
    metrics=test_features,
    group1_value=1,
    group2_value=0,
    significance_level=0.05,
    include_welch=True,
).sort_values('mw_p_value')

n_tested_study = len(stats_results_study)
print(f"\n✓ Completed tests on {n_tested_study} features")
n_significant_study = stats_results_study['mw_significant'].sum()
print(f"\nSignificant features (p < 0.05): {n_significant_study}")

display(stats_results_study.head(20))

In [0]:
# Group comparison on has_wonky_task: respondents flagged 1 versus flagged 0,
# run over every column in test_features (Welch output requested as well).
rule = "=" * 80
print(rule)
print("STATISTICAL TESTS: Features vs has_wonky_task")
print(rule)
print("\nComparing respondents with wonky tasks (1) vs without (0)")

task_test_kwargs = dict(
    df=df,
    group_col='has_wonky_task',
    metrics=test_features,
    group1_value=1,
    group2_value=0,
    significance_level=0.05,
    include_welch=True,
)
stats_results_task = compare_groups_statistically(**task_test_kwargs)

# Smallest 'mw_p_value' first
stats_results_task = stats_results_task.sort_values(by='mw_p_value')

print(f"\n✓ Completed tests on {len(stats_results_task)} features")
print(f"\nSignificant features (p < 0.05): {stats_results_task['mw_significant'].sum()}")

display(stats_results_task.head(20))

In [0]:
# Visualize statistical test results.
# One horizontal-bar panel per binary target. Bar length is -log10 of the
# 'mw_p_value' column, so longer bars correspond to smaller p-values. Only rows
# flagged 'mw_significant' are drawn, at most 15 per panel.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('Statistical Significance: has_wonky_study',
                    'Statistical Significance: has_wonky_task'),
    specs=[[{"secondary_y": False}, {"secondary_y": False}]]
)

# Floor applied before log10: a p-value stored as exactly 0.0 would otherwise
# become +inf and could not be drawn as a finite bar.
p_floor = np.finfo(float).tiny

# Top significant features for wonky_study
# Only plot when the results frame is non-empty and has the required columns
if (len(stats_results_study) > 0 and
    'mw_significant' in stats_results_study.columns and
    'mw_p_value' in stats_results_study.columns and
    'metric' in stats_results_study.columns):
    top_sig_study = stats_results_study[stats_results_study['mw_significant']].head(15)
    if len(top_sig_study) > 0:
        fig.add_trace(
            go.Bar(x=-np.log10(top_sig_study['mw_p_value'].clip(lower=p_floor)),
                   y=top_sig_study['metric'],
                   orientation='h',
                   marker_color='steelblue',
                   name='wonky_study'),
            row=1, col=1
        )
else:
    print("Warning: stats_results_study is empty or missing required columns")

# Top significant features for wonky_task
# Only plot when the results frame is non-empty and has the required columns
if (len(stats_results_task) > 0 and
    'mw_significant' in stats_results_task.columns and
    'mw_p_value' in stats_results_task.columns and
    'metric' in stats_results_task.columns):
    top_sig_task = stats_results_task[stats_results_task['mw_significant']].head(15)
    if len(top_sig_task) > 0:
        fig.add_trace(
            go.Bar(x=-np.log10(top_sig_task['mw_p_value'].clip(lower=p_floor)),
                   y=top_sig_task['metric'],
                   orientation='h',
                   marker_color='coral',
                   name='wonky_task'),
            row=1, col=2
        )
else:
    print("Warning: stats_results_task is empty or missing required columns")

fig.update_layout(
    height=600,
    title_text="Top Significant Features (-log10 p-value)",
    showlegend=False
)

fig.update_xaxes(title_text="-log10(p-value)", row=1, col=1)
fig.update_xaxes(title_text="-log10(p-value)", row=1, col=2)

# Add significance threshold.
# The -log10(p) values are on the x-axis of these horizontal bar charts (the
# y-axis holds the categorical feature names), so the p = 0.05 cut-off must be a
# vertical line at x = -log10(0.05); a horizontal line at that y value does not
# correspond to any p-value.
significance_x = -np.log10(0.05)
fig.add_vline(x=significance_x, line_dash="dash", line_color="red",
              annotation_text="p=0.05", row=1, col=1)
fig.add_vline(x=significance_x, line_dash="dash", line_color="red",
              annotation_text="p=0.05", row=1, col=2)

fig.show()

In [0]:
# Collect up to six significant feature names per target for the plots below.
# Each list stays empty when its results frame is empty or lacks either the
# 'mw_significant' or the 'metric' column.
needed_cols = {'mw_significant', 'metric'}

top_features_study = []
if len(stats_results_study) > 0 and needed_cols.issubset(stats_results_study.columns):
    significant_study = stats_results_study.loc[stats_results_study['mw_significant'], 'metric']
    top_features_study = significant_study.head(6).tolist()

top_features_task = []
if len(stats_results_task) > 0 and needed_cols.issubset(stats_results_task.columns):
    significant_task = stats_results_task.loc[stats_results_task['mw_significant'], 'metric']
    top_features_task = significant_task.head(6).tolist()

print(f"Top features for wonky_study analysis: {top_features_study}")
print(f"Top features for wonky_task analysis: {top_features_task}")

In [0]:
# Box plots comparing distributions for wonky_study.
# Draws one box plot per selected feature, split by the has_wonky_study flag.
print("=" * 80)
print("DISTRIBUTION COMPARISONS: has_wonky_study")
print("=" * 80)

# Use significant features if available, otherwise use top correlated features
if len(top_features_study) == 0:
    print("No significant features found. Using top correlated features instead...")
    if len(corr_study_count) > 0:
        # Positions 1..6: skip position 0, which is assumed to be the target's
        # correlation with itself (relies on how corr_study_count was sorted).
        top_features_study = corr_study_count.iloc[1:7].index.tolist()
        print(f"Using top correlated features: {top_features_study}")

if len(top_features_study) > 0:
    for feature in top_features_study[:6]:
        if feature in df.columns:
            # Best-effort plotting: a failure on one feature is reported and
            # the loop moves on to the next feature.
            try:
                # Accept every numeric dtype (int32, float32, nullable Int64, ...),
                # not only int64/float64; booleans are still skipped.
                is_plottable = (
                    pd.api.types.is_numeric_dtype(df[feature])
                    and not pd.api.types.is_bool_dtype(df[feature])
                )
                if not is_plottable:
                    print(f"Skipping {feature}: not numeric")
                    continue

                # Remove NaN values for plotting
                plot_df = df[['has_wonky_study', feature]].dropna()
                if len(plot_df) == 0:
                    print(f"Skipping {feature}: no valid data after removing NaNs")
                    continue

                fig = create_box_plot(
                    df=plot_df,
                    x='has_wonky_study',
                    y=feature,
                    title=f'{feature} by Wonky Study Status',
                    labels={'has_wonky_study': 'Has Wonky Study', feature: feature}
                )
                # One tick per flag value (0 and 1)
                fig.update_xaxes(tickmode='linear', tick0=0, dtick=1)
                fig.update_layout(height=500)
                fig.show()
            except Exception as e:
                print(f"Error creating plot for {feature}: {e}")
else:
    print("No features available for plotting")

In [0]:
# Box plots comparing distributions for wonky_task.
# Draws one box plot per selected feature, split by the has_wonky_task flag.
print("=" * 80)
print("DISTRIBUTION COMPARISONS: has_wonky_task")
print("=" * 80)

# Use significant features if available, otherwise use top correlated features
if len(top_features_task) == 0:
    print("No significant features found. Using top correlated features instead...")
    if len(corr_task_instances) > 0:
        # Positions 1..6: skip position 0, which is assumed to be the target's
        # correlation with itself (relies on how corr_task_instances was sorted).
        top_features_task = corr_task_instances.iloc[1:7].index.tolist()
        print(f"Using top correlated features: {top_features_task}")

if len(top_features_task) > 0:
    for feature in top_features_task[:6]:
        if feature in df.columns:
            # Best-effort plotting: a failure on one feature is reported and
            # the loop moves on to the next feature.
            try:
                # Accept every numeric dtype (int32, float32, nullable Int64, ...),
                # not only int64/float64; booleans are still skipped.
                is_plottable = (
                    pd.api.types.is_numeric_dtype(df[feature])
                    and not pd.api.types.is_bool_dtype(df[feature])
                )
                if not is_plottable:
                    print(f"Skipping {feature}: not numeric")
                    continue

                # Remove NaN values for plotting
                plot_df = df[['has_wonky_task', feature]].dropna()
                if len(plot_df) == 0:
                    print(f"Skipping {feature}: no valid data after removing NaNs")
                    continue

                fig = create_box_plot(
                    df=plot_df,
                    x='has_wonky_task',
                    y=feature,
                    title=f'{feature} by Wonky Task Status',
                    labels={'has_wonky_task': 'Has Wonky Task', feature: feature}
                )
                # One tick per flag value (0 and 1)
                fig.update_xaxes(tickmode='linear', tick0=0, dtick=1)
                fig.update_layout(height=500)
                fig.show()
            except Exception as e:
                print(f"Error creating plot for {feature}: {e}")
else:
    print("No features available for plotting")

In [0]:
# Scatter plots for top correlated features vs target variables.
# Positions 1..5 of each correlation ranking are kept; position 0 is skipped on
# the assumption that it is the target's correlation with itself.
# top_corr_features_task is defined here and consumed by the next cell.
top_corr_features_study = corr_study_count.iloc[1:6].index.tolist()  # Exclude self
top_corr_features_task = corr_task_instances.iloc[1:6].index.tolist()  # Exclude self

print("Scatter plots: Features vs wonky_study_count")
for feature in top_corr_features_study[:4]:
    if feature not in df.columns:
        continue
    # Accept every numeric dtype (int32, float32, nullable Int64, ...), not only
    # int64/float64; booleans are still skipped.
    if not pd.api.types.is_numeric_dtype(df[feature]) or pd.api.types.is_bool_dtype(df[feature]):
        continue
    fig = create_scatter_plot(
        df=df,
        x=feature,
        y='wonky_study_count',
        title=f'{feature} vs wonky_study_count',
        labels={feature: feature, 'wonky_study_count': 'Wonky Study Count'}
    )
    fig.show()

In [0]:
# Scatter plots of the top correlated features against wonky_task_instances.
# Uses top_corr_features_task, which is built in the preceding cell.
print("Scatter plots: Features vs wonky_task_instances")
for feature in top_corr_features_task[:4]:
    if feature not in df.columns:
        continue
    # Accept every numeric dtype (int32, float32, nullable Int64, ...), not only
    # int64/float64; booleans are still skipped.
    if not pd.api.types.is_numeric_dtype(df[feature]) or pd.api.types.is_bool_dtype(df[feature]):
        continue
    fig = create_scatter_plot(
        df=df,
        x=feature,
        y='wonky_task_instances',
        title=f'{feature} vs wonky_task_instances',
        labels={feature: feature, 'wonky_task_instances': 'Wonky Task Instances'}
    )
    fig.show()

In [0]:
# Categorical candidates: object/bool columns of df, minus the respondentPk column.
categorical_cols = [
    col
    for col in df.select_dtypes(include=['object', 'bool']).columns.tolist()
    if col != 'respondentPk'
]

# Numeric columns with fewer than 20 distinct values may really be categorical;
# the two binary target flags are left out of this list.
low_cardinality_numeric = []
for col in numeric_cols:
    if df[col].nunique() < 20 and col not in ('has_wonky_study', 'has_wonky_task'):
        low_cardinality_numeric.append(col)

print(f"Categorical columns: {len(categorical_cols)}")
print(f"Low cardinality numeric (potential categorical): {len(low_cardinality_numeric)}")
print(f"\nCategorical columns: {categorical_cols}")
print(f"\nLow cardinality numeric: {low_cardinality_numeric[:10]}")

In [0]:
# Cross-tabulate each categorical column against the has_wonky_study flag:
# raw counts (with margins) followed by row-wise percentages.
rule = "=" * 80
print(rule)
print("CATEGORICAL FEATURES vs wonky_study_count")
print(rule)

# Only the first 10 categorical columns are shown, for brevity
for col in categorical_cols[:10]:
    if col not in df.columns:
        continue

    print(f"\n{col}:")
    crosstab = pd.crosstab(df[col], df['has_wonky_study'], margins=True)
    print(crosstab)

    # Share of each flag value within every category (rows sum to 100)
    pct_crosstab = pd.crosstab(df[col], df['has_wonky_study'], normalize='index') * 100
    print("\nPercentages (by row):")
    print(pct_crosstab.round(2))
    print("-" * 80)

In [0]:
# Cross-tabulate each categorical column against the has_wonky_task flag:
# raw counts (with margins) followed by row-wise percentages.
rule = "=" * 80
print(rule)
print("CATEGORICAL FEATURES vs wonky_task_instances")
print(rule)

# Only the first 10 categorical columns are shown, for brevity
for col in categorical_cols[:10]:
    if col not in df.columns:
        continue

    print(f"\n{col}:")
    crosstab = pd.crosstab(df[col], df['has_wonky_task'], margins=True)
    print(crosstab)

    # Share of each flag value within every category (rows sum to 100)
    pct_crosstab = pd.crosstab(df[col], df['has_wonky_task'], normalize='index') * 100
    print("\nPercentages (by row):")
    print(pct_crosstab.round(2))
    print("-" * 80)

In [0]:
# Chi-squared tests of categorical features against has_wonky_study.
# Candidate set: all categorical columns plus the first 10 low-cardinality
# numeric columns; test_categorical is reused by the has_wonky_task cell below.
candidate_categorical = categorical_cols + low_cardinality_numeric[:10]

# Keep columns present in df that have between 2 and 49 distinct values
test_categorical = [
    col for col in candidate_categorical
    if col in df.columns and 1 < df[col].nunique() < 50
]

print(f"Testing {len(test_categorical)} categorical features")

# Chi-squared tests for wonky_study
chi2_results_study = perform_chi_square_tests(
    df=df,
    feature_set=test_categorical,
    group_var='has_wonky_study',
    significance_level=0.05,
)

if len(chi2_results_study) > 0:
    print("\nChi-squared test results for has_wonky_study:")
    # Smallest 'chi_p_value' first
    chi2_results_study = chi2_results_study.sort_values(by='chi_p_value')
    display(chi2_results_study.head(15))

In [0]:
# Chi-squared tests of the same test_categorical feature set against has_wonky_task
chi2_task_kwargs = dict(
    df=df,
    feature_set=test_categorical,
    group_var='has_wonky_task',
    significance_level=0.05,
)
chi2_results_task = perform_chi_square_tests(**chi2_task_kwargs)

if len(chi2_results_task) > 0:
    print("\nChi-squared test results for has_wonky_task:")
    # Smallest 'chi_p_value' first
    chi2_results_task = chi2_results_task.sort_values(by='chi_p_value')
    display(chi2_results_task.head(15))

#### Feature importance summary

In [0]:
# Combine correlation and statistical test results for wonky_study_count.
# NOTE(review): a later cell in this section recomputes the same summary with
# extra diagnostics; consider keeping only one of the two.
print("=" * 80)
print("FEATURE IMPORTANCE SUMMARY: wonky_study_count")
print("=" * 80)

if len(stats_results_study) > 0 and 'metric' in stats_results_study.columns:
    # Build an explicit (metric, correlation) frame from the correlation Series.
    # Assigning the column names directly avoids relying on reset_index()
    # producing a column literally called 'index' and on the Series being named
    # 'wonky_study_count'; when either assumption failed, the merge raised KeyError.
    corr_df = corr_study_count.to_frame(name='correlation').reset_index()
    corr_df.columns = ['metric', 'correlation']

    # Left merge: keep every tested feature, attach its correlation when available
    importance_study = stats_results_study.merge(corr_df, on='metric', how='left')

    # Calculate importance score (combination of correlation magnitude and statistical significance)
    if 'mw_p_value' in importance_study.columns:
        # Features with no correlation entry score 0 instead of NaN
        importance_study['correlation'] = importance_study['correlation'].fillna(0)

        # score = |correlation| * -log10(p + 1e-10); the offset keeps log10 finite
        # at p == 0 and therefore caps the significance factor at 10.
        importance_study['importance_score'] = (
            np.abs(importance_study['correlation']) *
            (-np.log10(importance_study['mw_p_value'] + 1e-10))
        )
        importance_study = importance_study.sort_values('importance_score', ascending=False)

        # Preferred column order; columns absent from the merged frame are dropped
        display_cols = ['metric', 'correlation', 'mw_p_value', 'mw_significant',
                        'mean_difference', 'importance_score']

        print("\nTop 20 Most Important Features:")
        display(importance_study[[col for col in display_cols if col in importance_study.columns]].head(20))
    else:
        print("Warning: mw_p_value column not found in stats_results_study")
else:
    print("Warning: stats_results_study is empty or missing required columns")

In [0]:
# Combine correlation and statistical test results for wonky_task_instances.
print("=" * 80)
print("FEATURE IMPORTANCE SUMMARY: wonky_task_instances")
print("=" * 80)

if len(stats_results_task) > 0 and 'metric' in stats_results_task.columns:
    # Build an explicit (metric, correlation) frame from the correlation Series.
    # Assigning the column names directly avoids relying on reset_index()
    # producing a column literally called 'index' and on the Series being named
    # 'wonky_task_instances'; when either assumption failed, the merge raised KeyError.
    corr_df = corr_task_instances.to_frame(name='correlation').reset_index()
    corr_df.columns = ['metric', 'correlation']

    # Left merge: keep every tested feature, attach its correlation when available
    importance_task = stats_results_task.merge(corr_df, on='metric', how='left')

    # Calculate importance score
    if 'mw_p_value' in importance_task.columns:
        # Features with no correlation entry score 0 instead of NaN
        importance_task['correlation'] = importance_task['correlation'].fillna(0)

        # score = |correlation| * -log10(p + 1e-10); the offset keeps log10 finite
        # at p == 0 and therefore caps the significance factor at 10.
        importance_task['importance_score'] = (
            np.abs(importance_task['correlation']) *
            (-np.log10(importance_task['mw_p_value'] + 1e-10))
        )
        importance_task = importance_task.sort_values('importance_score', ascending=False)

        # Preferred column order; columns absent from the merged frame are dropped
        display_cols = ['metric', 'correlation', 'mw_p_value', 'mw_significant',
                        'mean_difference', 'importance_score']

        print("\nTop 20 Most Important Features:")
        display(importance_task[[col for col in display_cols if col in importance_task.columns]].head(20))
    else:
        print("Warning: mw_p_value column not found in stats_results_task")
else:
    print("Warning: stats_results_task is empty or missing required columns")

In [0]:
# Feature-importance summary for wonky_study_count: joins the group-test
# results with the per-feature correlation and ranks by a combined score.
rule = "=" * 80
print(rule)
print("FEATURE IMPORTANCE SUMMARY: wonky_study_count")
print(rule)

study_results_usable = len(stats_results_study) > 0 and 'metric' in stats_results_study.columns

if not study_results_usable:
    print("Warning: stats_results_study is empty or missing required columns")
    if len(stats_results_study) == 0:
        print("stats_results_study is empty")
    elif 'metric' not in stats_results_study.columns:
        print(f"Available columns: {list(stats_results_study.columns)}")
elif len(corr_study_count) == 0:
    print("Warning: corr_study_count is empty")
else:
    # Correlation Series -> two-column frame keyed by feature name
    corr_df = corr_study_count.to_frame(name='correlation').reset_index()
    corr_df.columns = ['metric', 'correlation']

    # Left join keeps every tested feature, with or without a correlation value
    importance_study = stats_results_study.merge(corr_df, how='left', on='metric')

    print(f"Merged {len(importance_study)} features")
    print(f"Features with correlation data: {importance_study['correlation'].notna().sum()}")

    if 'mw_p_value' not in importance_study.columns:
        print("Warning: mw_p_value column not found in stats_results_study")
        print(f"Available columns: {list(stats_results_study.columns)}")
    else:
        # Missing correlations count as 0 in the score
        importance_study['correlation'] = importance_study['correlation'].fillna(0)

        # score = |correlation| * -log10(p + 1e-10)
        significance_factor = -np.log10(importance_study['mw_p_value'] + 1e-10)
        importance_study['importance_score'] = np.abs(importance_study['correlation']) * significance_factor
        importance_study = importance_study.sort_values('importance_score', ascending=False)

        # Column order for display; 'mw_significant' is included only when present
        display_cols = ['metric', 'correlation', 'mw_p_value']
        if 'mw_significant' in importance_study.columns:
            display_cols.append('mw_significant')
        display_cols += ['mean_difference', 'importance_score']

        print("\nTop 20 Most Important Features:")
        available_cols = [col for col in display_cols if col in importance_study.columns]
        result_df = importance_study[available_cols].head(20)
        if len(result_df) > 0:
            display(result_df)
        else:
            print("No results to display")

### Inconsistency flags