Exec Summary

-
-
-
-
-

In [0]:
%pip install pyyaml>=6.0 -q

In [0]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

try:
    import yaml
except ImportError:
    raise ImportError(
        "PyYAML is not installed. Please run the previous cell to install it, "
        "or run: %pip install pyyaml>=6.0"
    )

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_time_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score
)
from eda.statistical_tests import (
    compare_groups_statistically,
    compare_groups_with_both_tests,
    analyze_thresholds,
    perform_chi_square_tests,
    perform_mannwhitney_tests,
    perform_welch_ttests,
    perform_two_proportion_z_tests,
    compare_demographic_groups
)

from eda.visualizations import (
    create_histogram,
    create_box_plot,
    create_scatter_plot,
    create_bar_plot,
    create_temporal_breakdown_summary,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_dual_axis_statistical_chart,
    create_feature_breakdown_table,
    create_distribution_comparison,
    calculate_temporal_feature_deltas,       
    create_chi_squared_delta_dual_axis_chart,
)

# Load configs
def _load_yaml_config(path):
    """Parse one YAML config file with the safe loader.

    The file is opened explicitly as UTF-8 so the result does not depend on the
    platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


feature_config = _load_yaml_config('../configs/feature_engineering.yaml')
stats_config = _load_yaml_config('../configs/statistical_tests.yaml')
paths_config = _load_yaml_config('../configs/data_paths.yaml')

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### File Definitions

- **user_info_df**: DataFrame of respondent x task level data for all users (not just wonky studies)
- **wonky_studies_df**: DataFrame of respondents involved in studies with unexpected outcomes (negative impacts when positive expected)

A study is "wonky" if the outcome is unexpected (e.g., advertisement showed negative impacts of media, which is counter-intuitive).


### Load

In [0]:
# Locate the repo's misc/ folder relative to the notebook's working directory.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, os.pardir))
misc_dir = os.path.join(repo_root, "misc")


def _misc_path(output_key):
    """Return misc_dir joined with the base name of the configured output file."""
    configured_file = paths_config['output_files'][output_key]
    return os.path.join(misc_dir, os.path.basename(configured_file))


output_path = _misc_path('user_info_df')
wonky_counts_path = _misc_path('wonky_user_counts')
wonky_respondent_df_path = _misc_path('wonky_respondent_df')
wonky_respondent_summary_path = _misc_path('wonky_respondent_summary')

# total user info
user_info_df = pd.read_parquet(output_path)
# normal tasks and wonky tasks for wonky task respondents
wonky_counts = pd.read_parquet(wonky_counts_path)
# task level info for wonky task respondents
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
# summary of wonky task respondents
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)

In [0]:
# Attach wonky_study_count to each respondent/task row of user_info_df.
# Exact duplicate rows in the lookup are dropped first: in a left merge every
# repeated right-hand match would otherwise duplicate the task row and inflate
# all downstream counts and proportions.
wonky_task_lookup = wonky_respondent_df[
    ['balance_respondentPk', 'task_pk', 'wonky_study_count']
].drop_duplicates()

user_info_df = user_info_df.merge(
    wonky_task_lookup,
    left_on=['balance_respondentPk', 'taskPk'],
    right_on=['balance_respondentPk', 'task_pk'],
    how='left',  # keep every task; unmatched tasks get a missing wonky_study_count
)

In [0]:
# wonky_respondent_summary comes from pd.read_parquet, i.e. it is a pandas DataFrame,
# which has no .display() method; render it with the notebook's display() function.
display(wonky_respondent_summary)

In [0]:
wonky_respondent_df

In [0]:
# Quick look at the first rows of both task-level frames.
print(user_info_df.head())

print(wonky_respondent_df.head())

# Per-column null counts for user_info_df, rendered as a table.
df = pd.DataFrame(user_info_df.isnull().sum(), columns=['null_count'])
display(df.reset_index())

# Columns of wonky_respondent_df that contain at least one missing value.
# The heading names the frame that is actually inspected here.
print("\nwonky_respondent_df - Missing values:")
missing_wonky = wonky_respondent_df.isnull().sum()
print(missing_wonky[missing_wonky > 0])

In [0]:
wonky_counts

In [0]:
# Overall summary statistics for whichever key numeric columns are present.
key_numeric_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality', 'task_completed']
available_cols = [col for col in key_numeric_cols if col in user_info_df.columns]
print(user_info_df[available_cols].describe())

# Task-level comparison; only runs when the flag column exists in user_info_df.
if 'wonky_study_flag' in user_info_df.columns:
    print("\n" + "=" * 80)
    print("COMPARISON BY wonky_study_flag (Task Level)")
    print("=" * 80)
    comparison_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality']
    comparison_cols = [col for col in comparison_cols if col in user_info_df.columns]

    if len(comparison_cols) > 0:
        wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 1]
        non_wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 0]

        print("\nWonky Study Tasks (wonky_study_flag=1):")
        print(wonky_study_tasks[comparison_cols].describe())

        print("\nNon-Wonky Study Tasks (wonky_study_flag=0):")
        print(non_wonky_study_tasks[comparison_cols].describe())

        # NOTE(review): the merge earlier in this notebook adds 'wonky_study_count';
        # confirm 'wonky_studies_count' is a real column and not a typo, otherwise
        # this branch never runs.
        if 'wonky_studies_count' in user_info_df.columns:
            wonky_user_tasks = user_info_df[user_info_df['wonky_studies_count'] > 0]
            print("\nTasks from Users with Wonky Studies (wonky_studies_count > 0):")
            print(wonky_user_tasks[comparison_cols].describe())

# The frame summarised below is wonky_counts, so the banner names it accordingly.
print("\n" + "=" * 80)
print("STATISTICAL SUMMARY: wonky_counts")
print("=" * 80)
print(wonky_counts.describe())


### Feature Engineering

In [0]:
main_features = []

### Behavioural Stuff

#### Days active before task

In [0]:
# Earliest completion date per respondent, one row per respondentPk.
min_dates = (
    user_info_df
    .groupby('respondentPk', as_index=False)['date_completed']
    .min()
    .rename(columns={'date_completed': 'first_task_completed_date'})
)

In [0]:
# Stamp every task row with its respondent's earliest completion date.
# A grouped transform (instead of merging a lookup table) keeps this cell
# idempotent: re-running it overwrites the column rather than producing
# first_task_completed_date_x / _y duplicates and a KeyError on the next line.
user_info_df["first_task_completed_date"] = (
    user_info_df.groupby("respondentPk")["date_completed"].transform("min")
)

# Whole days between the respondent's first completed task and this task.
user_info_df["days_active_before_task"] = (
    user_info_df["date_completed"] - user_info_df["first_task_completed_date"]
).dt.days

In [0]:
user_info_df["days_active_before_task"].value_counts()

In [0]:
# Distribution of days-active values restricted to rows with a positive wonky_study_count.
has_wonky_study = user_info_df['wonky_study_count'] > 0
user_info_df.loc[has_wonky_study, 'days_active_before_task'].value_counts()

In [0]:
user_info_df_shortened = user_info_df[['respondentPk', 'days_active_before_task', 'wonky_study_count']]

In [0]:
# Source column: whole days between a respondent's first completed task and this task.
series = user_info_df_shortened['days_active_before_task']

# One-hot encode the day counts: explode() expands any list-like entries (scalars
# pass through unchanged), values are stringified and stripped, get_dummies builds
# one indicator column per distinct string, and groupby(level=0).sum() folds rows
# that share an original index label back into a single row.
# NOTE(review): astype(str) turns a missing value into the text 'nan', so a
# 'days_active_nan' column appears if any day count is missing — confirm intended.
daysactive_dummies = (
    series.explode()                    
     .astype(str).str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

# Prefix the indicator columns so they are recognisable as days-active features.
daysactive_dummies = daysactive_dummies.add_prefix('days_active_')

# Column names reused as the feature set for the statistical tests that follow.
daysactive_cols = daysactive_dummies.columns

# Index-aligned join of the indicators back onto the shortened frame.
user_info_df_shortened = user_info_df_shortened.join(daysactive_dummies)

In [0]:
user_info_df_shortened

##### Test

In [0]:
chi_square_results = perform_chi_square_tests(
    user_info_df_shortened,
    feature_set=daysactive_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
ztest_results = perform_two_proportion_z_tests(
    user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
daysactive_cols

In [0]:
# Register the per-day indicator columns plus the raw day count as candidate features.
main_features.extend(daysactive_cols)
main_features.append('days_active_before_task')
main_features

#### Temporal Feature Analysis & Breakdowns - STRONG HYPOTHESIS

Analyzing temporal patterns to identify differences between wonky and non-wonky study tasks.


In [0]:
user_info_df['wonky_task_instances'].unique()

In [0]:
# Build the time-based columns with the shared helper; it is expected to add at
# least the is_night and is_weekend flags that are read below.
user_info_df = create_time_features(user_info_df, date_col="date_completed")

# Share of tasks carrying each flag, reported as a percentage.
for label, flag_col in (("Night", "is_night"), ("Weekend", "is_weekend")):
    print(f"{label} tasks: {user_info_df[flag_col].mean()*100:.1f}%")

In [0]:
# Day-of-week flags, Monday through Sunday.
day_of_week_flags = [
    f"is_{day}"
    for day in (
        "monday", "tuesday", "wednesday", "thursday",
        "friday", "saturday", "sunday",
    )
]

# Full list of temporal indicator columns analysed in this section.
temporal_features = [
    "is_weekday",
    "is_weekend",
    "is_night",
    "is_business_hour",
    "is_business_hour_weekday",
    "is_business_hour_weekend",
    *day_of_week_flags,
]

# Breakdown of each temporal flag, split on wonky_study_count with threshold 0.
temporal_breakdown = create_temporal_breakdown_summary(
    user_info_df,
    temporal_features=temporal_features,
    group_col="wonky_study_count",
    group_threshold=0,
)
print(temporal_breakdown)

**Task completion time is definitely a good gauge. The majority of tasks take place during business hours, spread relatively evenly across the work week. The LARGEST delta where wonky is more prevalent is in business hours, suggesting professional behaviours.**

##### Tests for Temporal Features

Testing independence between temporal features and wonky study participation.
Chi-squared test determines if temporal patterns differ significantly between wonky and non-wonky groups.


In [0]:
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=temporal_features,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
mannwhitney_results = perform_mannwhitney_tests(user_info_df, temporal_features, group_var='wonky_study_count')
mannwhitney_results

In [0]:
mannwhitney_results

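is_business_hour CLES (ROUGH INTERPRETATION): the common-language effect size is the probability that a randomly drawn observation from one group has a higher value than a randomly drawn observation from the other group. A value of about 0.55 therefore means roughly a 55% chance that a random task from one group scores higher on is_business_hour than a random task from the other; it is not a 55% chance that a given respondent produces a wonky study.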

In [0]:
welch_results = perform_welch_ttests(user_info_df, temporal_features, group_var='wonky_study_count')
welch_results

In [0]:
# Two-proportion z-tests on the temporal flags. The grouping column matches the
# chi-square, Mann-Whitney and Welch tests in this section (and every other
# z-test in the notebook), so the results are comparable across tests.
ztest_results = perform_two_proportion_z_tests(
    user_info_df,
    temporal_features,
    group_var='wonky_study_count',
)
ztest_results

**Strong significance for all subsets except Wednesday and Friday, although this doesn't take directionality into consideration -> the simple separators are business hours.**

In [0]:
main_features += list(temporal_features)

STRONG, SIGNIFICANT READ AT THE 99% LEVEL. LARGEST MAGNITUDE FOUND AT NIGHT; LOWEST MAGNITUDE DURING THE WEEKEND.

In [0]:
# # Visualize chi-squared statistics
# if len(chi_square_results) > 0:
#     fig_chi2 = create_chi_squared_bar_chart(
#         chi_square_results,
#         chi2_col='chi2',
#         p_value_col='chi_p_value',
#         significance_level=0.01,
#         title="Chi-Squared Statistic by Temporal Feature"
#     )
#     fig_chi2.show()
# else:
#     print("No chi-squared test results available for visualization")


In [0]:
# if len(chi_square_results) > 0:
#     delta_results = calculate_temporal_feature_deltas(
#       user_info_df,
#       temporal_features=temporal_features,
#       group_col='wonky_task_instances',  
#       group_threshold=0 
# )
    
#     if len(delta_results) > 0:
#         fig_dual = create_chi_squared_delta_dual_axis_chart(
#             chi_square_results,
#             delta_results,
#             chi2_col='chi2',
#             p_value_col='chi_p_value',
#             delta_col='delta_pct',
#             significance_level=0.01,
#             title="Chi-Squared Statistic and Delta % by Temporal Feature"
#         )
#         fig_dual.show()
#     else:
#         print("No delta results available for visualization")
# else:
#     print("No chi-squared test results available for visualization")

**The bar is the level of significance between the wonky and non-wonky groups; the line shows the deltas between wonky and non-wonky in terms of when tasks are completed.**

**A positive delta means wonky participants are more prevalent and a negative delta means they are less prevalent.**

**Business hours, night time and Saturdays look like the overall best separators between wonky and non-wonky participants in terms of task completion time.**

#### Task speed features - OKAY HYPOTHESIS

In [0]:
# Cap task times at the 99.99th percentile because a few very anomalous durations
# throw off the average.
# Series.clip leaves missing task times as NaN. An np.where(x < cap, x, cap) form
# would evaluate NaN < cap as False and silently replace missing times with the cap.
task_time_cap = user_info_df['task_time_taken_s'].quantile(0.9999)
user_info_df['task_time_taken_s_capped'] = user_info_df['task_time_taken_s'].clip(
    upper=task_time_cap
)

In [0]:
# Build the speed features from the capped task time via the shared helper.
user_info_df = create_task_speed_features(
    user_info_df, task_time_col="task_time_taken_s_capped", use_std_dev=True
)

# Mean and standard deviation of the capped times, used for the printed thresholds.
capped_times = user_info_df["task_time_taken_s_capped"]
mean_time = capped_times.mean()
std_time = capped_times.std()

print("Task time statistics:")
print(f"  Mean: {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev: {std_time:.2f} seconds ({std_time/60:.2f} minutes)")

# Thresholds at one and two standard deviations either side of the mean.
sigma_thresholds = (
    ("Fast threshold (mean - 1σ)", mean_time - std_time),
    ("Suspiciously fast threshold (mean - 2σ)", mean_time - 2*std_time),
    ("Slow threshold (mean + 1σ)", mean_time + std_time),
    ("Suspiciously slow threshold (mean + 2σ)", mean_time + 2*std_time),
)
for label, threshold in sigma_thresholds:
    print(f"  {label}: {threshold:.2f}s")
print()

# Display breakdown with wonky vs non-wonky comparison
speed_breakdown = create_task_speed_breakdown_summary(
    user_info_df,
    group_col='wonky_task_instances',
    group_threshold=0,
)
print(speed_breakdown)

**Wonky participants are usually suspiciously fast to normal, whereas non-wonky participants tend to be normal to suspiciously slow, in terms of delta.**

In [0]:
# Recompute the speed features on the raw (uncapped) task time.
# NOTE(review): this call still passes use_std_dev=True, while everything printed
# below describes percentile thresholds — confirm which mode the helper should use.
# It also overwrites the features built from the capped column in the previous cell.
user_info_df = create_task_speed_features(
    user_info_df, task_time_col="task_time_taken_s", use_std_dev=True
)

raw_times = user_info_df["task_time_taken_s"]

# Percentile cut-offs reported below.
fast_threshold = raw_times.quantile(0.16)
suspiciously_fast_threshold = raw_times.quantile(0.025)
slow_threshold = raw_times.quantile(0.84)
suspiciously_slow_threshold = raw_times.quantile(0.975)

# Reference mean/std with extremes limited to the 1st-99th percentile range.
# clip() caps values at the bounds (it does not drop them), so these statistics
# are computed on capped data even though they are labelled "trimmed".
trimmed_data = raw_times.clip(
    lower=raw_times.quantile(0.01),
    upper=raw_times.quantile(0.99),
)
mean_time = trimmed_data.mean()
std_time = trimmed_data.std()

print("Task time statistics (using percentiles, robust to outliers):")
print(f"  Mean (trimmed 1%-99%): {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev (trimmed 1%-99%): {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
percentile_thresholds = (
    ("Fast threshold (16th percentile)", fast_threshold),
    ("Suspiciously fast threshold (2.5th percentile)", suspiciously_fast_threshold),
    ("Slow threshold (84th percentile)", slow_threshold),
    ("Suspiciously slow threshold (97.5th percentile)", suspiciously_slow_threshold),
)
for label, threshold in percentile_thresholds:
    print(f"  {label}: {threshold:.2f}s ({threshold/60:.2f} min)")
print()

# Display breakdown with wonky vs non-wonky comparison
if 'wonky_task_instances' in user_info_df.columns:
    group_col_to_use = 'wonky_task_instances'
else:
    group_col_to_use = 'wonky_study_count'

speed_breakdown = create_task_speed_breakdown_summary(
    user_info_df,
    group_col=group_col_to_use,
    group_threshold=0,
)
print(speed_breakdown)

##### Tests

In [0]:
speed_features = ['is_suspiciously_fast', 'is_fast', 'is_normal_speed', 'is_slow', 'is_suspiciously_slow']

In [0]:
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=speed_features,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, speed_features, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
welch_results = perform_welch_ttests(
    user_info_df, speed_features, group_var="wonky_study_count"
)
welch_results

In [0]:
ztest_results = perform_two_proportion_z_tests(
    user_info_df, speed_features, group_var="wonky_study_count"
)
ztest_results

In [0]:
main_features += speed_features
main_features

Suspiciously slow is unanimously a signal of a non-wonky participant.

TODO -> Come up with good Viz across test

#### Task and Point category - WEAK HYPOTHESIS

In [0]:
# user_info_df['defined_task_category'] = user_info_df['taskCategory'].astype(str) + "_points_" + user_info_df['payoutPoints'].astype(str)

# # Import the function
# from eda.statistical_tests import compare_task_category_wonky_rates

# # Create the summary table directly from user_info_df
# # Why: Analyzes wonky rates by task category to identify which categories have higher fraud rates
# category_wonky_summary = compare_task_category_wonky_rates(
#     df=user_info_df,
#     category_col="defined_task_category",
#     respondent_id_col="respondentPk",
#     wonky_col="wonky_study_count"
# )

# # Display the results
# print("Wonky Rates by Task Category:")
# print("=" * 100)
# display(category_wonky_summary)

# # Format for better readability
# category_wonky_summary_formatted = category_wonky_summary.copy()
# category_wonky_summary_formatted['wonky_pct'] = category_wonky_summary_formatted['wonky_pct'].round(2)
# category_wonky_summary_formatted['non_wonky_pct'] = category_wonky_summary_formatted['non_wonky_pct'].round(2)
# category_wonky_summary_formatted['proportion_delta'] = category_wonky_summary_formatted['proportion_delta'].round(2)

# display(category_wonky_summary_formatted)

#### Device

In [0]:
# Source column for the device indicators.
series = user_info_df['hardware_version']

# One-hot encode hardware_version: explode() expands any list-like entries,
# strings are stripped, get_dummies builds one indicator column per distinct
# value, and groupby(level=0).sum() folds rows that share an original index
# label back into a single row.
hardware_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

# Indicator column names, reused as the feature set for the summaries and tests below.
hardware_cols = hardware_dummies.columns

# Index-aligned join of the indicators onto the task-level frame.
user_info_df = user_info_df.join(hardware_dummies)

In [0]:
hardware_cols

In [0]:
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=hardware_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

Huge delta for Desktop, but this might be a sample-size artefact; it may converge better with 3-6 months of data.

###### Tests

In [0]:
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=hardware_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, hardware_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
welch_results = perform_welch_ttests(
    user_info_df, hardware_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
ztest_results = perform_two_proportion_z_tests(
    user_info_df, hardware_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
main_features += ['Desktop', 'Iphone']
main_features

#### Platform

In [0]:
# Source column for the platform indicators.
series = user_info_df['platform_name']

# One-hot encode platform_name: explode() expands any list-like entries,
# strings are stripped, get_dummies builds one indicator column per distinct
# value, and groupby(level=0).sum() folds rows that share an original index
# label back into a single row.
platform_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

# Indicator column names, reused as the feature set for the summaries and tests below.
platform_cols = platform_dummies.columns

# Index-aligned join of the indicators onto the task-level frame.
user_info_df = user_info_df.join(platform_dummies)

In [0]:
platform_cols

In [0]:
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=platform_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

##### Tests

In [0]:
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=platform_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, platform_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
welch_results = perform_welch_ttests(
    user_info_df, platform_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
ztest_results = perform_two_proportion_z_tests(
    user_info_df, platform_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
main_features += ['Linux', 'iOS']
main_features

### Demographic stuff

#### Gambling

In [0]:
# Source column for the gambling indicators.
series = user_info_df['gambling_participation_mc']

# One-hot encode each gambling mode: explode() expands any list-like entries into
# one row per element, strings are stripped, get_dummies builds one indicator
# column per distinct value, and groupby(level=0).sum() folds the exploded rows
# back into a single row per original index label.
gambling_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

# Indicator column names, reused as the feature set for the summaries and tests below.
gambling_cols = gambling_dummies.columns

# Index-aligned join of the indicators onto the task-level frame.
user_info_df = user_info_df.join(gambling_dummies)

In [0]:
gambling_cols

In [0]:
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=gambling_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

###### Tests

In [0]:
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=gambling_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, gambling_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:

welch_results = perform_welch_ttests(
    user_info_df, gambling_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
ztest_results = perform_two_proportion_z_tests(
    user_info_df, gambling_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
main_features += ['do not gamble at all']
main_features

#### Income

In [0]:
# Household income bands in code order: the band at position i belongs to the
# i-th letter of "A".."R".
income_band_labels = (
    "Less than £15,000",
    "£15,000 to £19,999",
    "£20,000 to £24,999",
    "£25,000 to £29,999",
    "£30,000 to £34,999",
    "£35,000 to £39,999",
    "£40,000 to £44,999",
    "£45,000 to £49,999",
    "£50,000 to £59,999",
    "£60,000 to £74,999",
    "£75,000 to £84,999",
    "£85,000 to £99,999",
    "£100,000 to £124,999",
    "£125,000 to £149,999",
    "£150,000 to £174,999",
    "£175,000 to £199,999",
    "£200,000 and above",
    "Prefer not to answer",
)
income_map = dict(zip("ABCDEFGHIJKLMNOPQR", income_band_labels))

# Translate the letter codes into readable bands; codes missing from the map become NaN.
income_codes = user_info_df["fulcrum_household_income"]
user_info_df["fulcrum_household_income_mapped"] = income_codes.map(income_map)

user_info_df["fulcrum_household_income_mapped"].value_counts()

In [0]:
# Source column: the readable household income band.
series = user_info_df['fulcrum_household_income_mapped']

# One-hot encode each household income band: explode() expands any list-like
# entries, strings are stripped, get_dummies builds one indicator column per
# distinct band, and groupby(level=0).sum() folds rows that share an original
# index label back into a single row.
income_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

# Indicator column names, reused as the feature set for the tests below.
income_cols = income_dummies.columns

# Index-aligned join of the indicators onto the task-level frame.
user_info_df = user_info_df.join(income_dummies)

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0]['fulcrum_household_income_mapped'].value_counts()

most wonky studies are in lower income groups

##### Test

In [0]:
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=income_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, income_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
welch_results = perform_welch_ttests(
    user_info_df, income_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
ztest_results = perform_two_proportion_z_tests(
    user_info_df, income_cols, group_var="wonky_study_count"
)
ztest_results

In [0]:
main_features += list(income_cols)
main_features

In [0]:
len(main_features)

#### Income Gender

In [0]:
main_features += ['gender']
main_features

In [0]:
# Combine gender and income band into one category, but only where both are known.
# astype(str) alone turns a missing value into the literal text "nan", which would
# create spurious categories such as "nan_nan" that then get one-hot encoded and
# tested like real groups. Rows missing either part are left as NaN instead.
gender_income_known = (
    user_info_df['gender'].notna()
    & user_info_df['fulcrum_household_income_mapped'].notna()
)
user_info_df['gender_fulcrum_household_income_mapped'] = (
    user_info_df['gender'].astype(str)
    + "_"
    + user_info_df['fulcrum_household_income_mapped'].astype(str)
).where(gender_income_known)

In [0]:
# Source column: the combined "<gender>_<income band>" category.
series = user_info_df['gender_fulcrum_household_income_mapped']

# One-hot encode each gender x income-band combination: explode() expands any
# list-like entries, strings are stripped, get_dummies builds one indicator
# column per distinct combination, and groupby(level=0).sum() folds rows that
# share an original index label back into a single row.
gender_income_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

# Indicator column names, reused as the feature set for the tests below.
gender_income_cols = gender_income_dummies.columns

# Index-aligned join of the indicators onto the task-level frame.
user_info_df = user_info_df.join(gender_income_dummies)

In [0]:
user_info_df["gender_fulcrum_household_income_mapped"].value_counts()

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0]['gender_fulcrum_household_income_mapped'].value_counts()

Strong cut-off category: female with household income less than £15k.

##### Tests

In [0]:
chi_square_results = perform_chi_square_tests(
    user_info_df,
    feature_set=gender_income_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

chi_square_results

In [0]:
mannwhitney_results = perform_mannwhitney_tests(
    user_info_df, gender_income_cols, group_var="wonky_study_count"
)
mannwhitney_results

In [0]:
welch_results = perform_welch_ttests(
    user_info_df, gender_income_cols, group_var="wonky_study_count"
)
welch_results

In [0]:
ztest_results = perform_two_proportion_z_tests(
    user_info_df, gender_income_cols, group_var="wonky_study_count"
)
ztest_results

#### Exposure bands

In [0]:
# Share of wonky_respondent_df rows per exposure band. The denominator is the
# full row count, so rows with a missing band still count towards it.
exposure_band_counts = wonky_respondent_df['exposure_band'].value_counts()
exposure_band_counts / len(wonky_respondent_df)

### Respondent_Features

In [0]:
# Create respondent-level behavioral features
import importlib

# If the helper module is already loaded, reload it and re-import the function so
# the call below uses the reloaded definition.
if "eda.feature_engineering" in sys.modules:
    importlib.reload(sys.modules["eda.feature_engineering"])
    from eda.feature_engineering import create_respondent_behavioral_features

# Settings handed to the helper, all read from the feature-engineering config.
volume_thresholds = feature_config["volume_thresholds"]
respondent_feature_config = {
    "high_volume_percentile": volume_thresholds["high_volume_percentile"],
    "extreme_volume_percentile": volume_thresholds["extreme_volume_percentile"],
    "velocity_bins": feature_config["velocity_bins"],
    "velocity_labels": feature_config["velocity_labels"],
}

respondent_features = create_respondent_behavioral_features(
    user_info_df,
    respondent_id_col="respondentPk",
    date_col="date_completed",
    config=respondent_feature_config,
    categorical_cols=main_features,
)

respondent_count = respondent_features.shape[0]
avg_tasks_per_respondent = respondent_features['total_tasks'].mean()
print(f"Aggregated to {respondent_count:,} respondents")
print(f"Avg tasks per respondent: {avg_tasks_per_respondent:.2f}")

In [0]:
respondent_features['days_active_all_tasks'] = (respondent_features['last_task_date'] - respondent_features['first_task_date']).dt.days

In [0]:
# Enrich the respondent table with wonky-related columns via the shared helper.
respondent_features = add_wonky_features(
    respondent_features,
    wonky_counts,
    respondent_id_col="respondentPk",
)

print("Wonky features added")

# Respondents whose wonky_task_ratio is above zero.
has_wonky_tasks = respondent_features['wonky_task_ratio'] > 0
print(f"Respondents with wonky tasks: {has_wonky_tasks.sum():,}")

# Respondents flagged is_high_wonky.
high_wonky_total = respondent_features['is_high_wonky'].sum()
print(f"High wonky concentration (>50%): {high_wonky_total:,}")

In [0]:
respondent_features.sort_values('respondentPk')

In [0]:
# Inspect the respondent-level feature table. The name was misspelled as
# `respondent_feature`, which is never defined and raised NameError.
respondent_features

In [0]:
respondent_features[['respondentPk', 'wonky_study_count']].set_index('respondentPk').value_counts()

In [0]:
welch_results = perform_welch_ttests(
    respondent_features, income_cols, group_var="wonky_study_count", significance_level=0.05
)
welch_results

In [0]:
respondent_features