Exec Summary

-
-
-
-
-

In [0]:
%pip install pyyaml>=6.0 -q

In [0]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

try:
    import yaml
except ImportError:
    raise ImportError(
        "PyYAML is not installed. Please run the previous cell to install it, "
        "or run: %pip install pyyaml>=6.0"
    )

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_time_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score
)
from eda.statistical_tests import (
    compare_groups_statistically,
    compare_groups_with_both_tests,
    analyze_thresholds,
    perform_chi_square_tests,
    perform_mannwhitney_tests,
    perform_welch_ttests,
    perform_two_proportion_z_tests,
    compare_demographic_groups
)

from eda.visualizations import (
    create_histogram,
    create_box_plot,
    create_scatter_plot,
    create_bar_plot,
    create_temporal_breakdown_summary,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_dual_axis_statistical_chart,
    create_feature_breakdown_table,
    create_distribution_comparison,
    calculate_temporal_feature_deltas,       
    create_chi_squared_delta_dual_axis_chart,
)

# Load configs
# Each YAML file is read as UTF-8 explicitly: the platform default encoding is not
# guaranteed to be UTF-8, and a wrong default would garble any non-ASCII text.
with open('../configs/feature_engineering.yaml', 'r', encoding='utf-8') as f:
    feature_config = yaml.safe_load(f)

with open('../configs/statistical_tests.yaml', 'r', encoding='utf-8') as f:
    stats_config = yaml.safe_load(f)

with open('../configs/data_paths.yaml', 'r', encoding='utf-8') as f:
    paths_config = yaml.safe_load(f)

pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### File Definitions

- **user_info_df**: DataFrame of respondent x task level data for all users (not just wonky studies)
- **wonky_studies_df**: DataFrame of respondents involved in studies with unexpected outcomes (negative impacts when positive expected)

A study is "wonky" if the outcome is unexpected (e.g., advertisement showed negative impacts of media, which is counter-intuitive).


### Load

In [0]:
# The repository root is taken to be the parent of the current working directory;
# all input files are read from its "misc" folder.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, os.pardir))
misc_dir = os.path.join(repo_root, "misc")

# Only the file name of each configured output path is used; the directory part
# of the configured value is discarded and replaced by misc_dir.
output_files = paths_config['output_files']
(
    output_path,
    wonky_counts_path,
    wonky_respondent_df_path,
    wonky_respondent_summary_path,
) = (
    os.path.join(misc_dir, os.path.basename(output_files[key]))
    for key in (
        'user_info_df',
        'wonky_user_counts',
        'wonky_respondent_df',
        'wonky_respondent_summary',
    )
)

# total user info
user_info_df = pd.read_parquet(output_path)
# normal tasks and wonky tasks for wonky task respondents
wonky_counts = pd.read_parquet(wonky_counts_path)
# task level info for wonky task respondents
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
# summary of wonky task respondents
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)

In [0]:
wonky_respondent_df

In [0]:
# Attach wonky_study_count to the task-level frame. This is a left join, so tasks
# without a match in wonky_respondent_df get NaN.
# Columns added by a previous run of this cell are dropped first; without that a
# re-run would produce wonky_study_count_x / _y (and task_pk_x / _y) and break
# every later user_info_df['wonky_study_count'] lookup.
# NOTE(review): this assumes user_info_df has no independent 'task_pk' column of
# its own before the merge - confirm.
wonky_merge_cols = ['balance_respondentPk', 'task_pk', 'wonky_study_count']
user_info_df = (
    user_info_df
    .drop(columns=['task_pk', 'wonky_study_count'], errors='ignore')
    .merge(
        wonky_respondent_df[wonky_merge_cols],
        left_on=['balance_respondentPk', 'taskPk'],
        right_on=['balance_respondentPk', 'task_pk'],
        how='left',
    )
)

In [0]:
# wonky_respondent_summary is a pandas DataFrame (loaded with pd.read_parquet),
# which has no .display() method; use the display() function instead.
display(wonky_respondent_summary)

In [0]:
wonky_respondent_df

In [0]:
print(user_info_df.head())

print(wonky_respondent_df.head())

df = pd.DataFrame(user_info_df.isnull().sum(), columns=['null_count'])
display(df.reset_index())

print("\nwonky_studies_df - Missing values:")
missing_wonky = wonky_respondent_df.isnull().sum()
print(missing_wonky[missing_wonky > 0])

In [0]:
wonky_counts

In [0]:
# Summary statistics for whichever of the key numeric columns are present.
key_numeric_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality', 'task_completed']
available_cols = [col for col in key_numeric_cols if col in user_info_df.columns]
print(user_info_df[available_cols].describe())

if 'wonky_study_flag' in user_info_df.columns:
    print("\n" + "=" * 80)
    print("COMPARISON BY wonky_study_flag (Task Level)")
    print("=" * 80)
    comparison_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality']
    comparison_cols = [col for col in comparison_cols if col in user_info_df.columns]

    if len(comparison_cols) > 0:
        wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 1]
        non_wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 0]

        print("\nWonky Study Tasks (wonky_study_flag=1):")
        print(wonky_study_tasks[comparison_cols].describe())

        print("\nNon-Wonky Study Tasks (wonky_study_flag=0):")
        print(non_wonky_study_tasks[comparison_cols].describe())

        # The count column merged in earlier is 'wonky_study_count'; the
        # 'wonky_studies_count' spelling is still accepted in case the input
        # frame carries it. Checking only the latter meant this branch never ran.
        wonky_count_col = next(
            (col for col in ('wonky_study_count', 'wonky_studies_count')
             if col in user_info_df.columns),
            None,
        )
        if wonky_count_col is not None:
            wonky_user_tasks = user_info_df[user_info_df[wonky_count_col] > 0]
            print(f"\nTasks from Users with Wonky Studies ({wonky_count_col} > 0):")
            print(wonky_user_tasks[comparison_cols].describe())

# The frame summarised here is wonky_counts, so the header names it as such.
print("\n" + "=" * 80)
print("STATISTICAL SUMMARY: wonky_counts")
print("=" * 80)
print(wonky_counts.describe())


### Feature Engineering

In [0]:
main_features = []

### Behavioural Stuff

#### Days active before task

In [0]:
# Earliest date_completed per respondent, exposed as first_task_completed_date.
min_dates = (
    user_info_df
    .groupby('respondentPk', as_index=False)['date_completed']
    .min()
    .rename(columns={'date_completed': 'first_task_completed_date'})
)

In [0]:
# Drop the column from any previous run before merging, so re-running this cell
# does not create first_task_completed_date_x / _y and then fail on the lookup below.
user_info_df = (
    user_info_df
    .drop(columns=['first_task_completed_date'], errors='ignore')
    .merge(min_dates, on="respondentPk", how="left")
)
# Whole days between the respondent's first completed task and this task.
user_info_df["days_active_before_task"] = (
    user_info_df["date_completed"] - user_info_df["first_task_completed_date"]
).dt.days

In [0]:
user_info_df["days_active_before_task"].value_counts()

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0]['days_active_before_task'].value_counts()

In [0]:
user_info_df_shortened = user_info_df[['respondentPk', 'days_active_before_task', 'wonky_study_count']]

In [0]:
series = user_info_df_shortened['days_active_before_task']

# Use integer day labels so the columns are named "days_active_7" rather than
# "days_active_7.0" (which is what str() of a float column yields and which the
# later int() parsing of the suffix cannot handle). Rows with a missing day count
# get no indicator instead of a "days_active_nan" column.
day_labels = series.dropna().astype(int)

# One indicator column per distinct day count, re-aligned to every original row.
# reindex assumes the frame's index is unique (it is a fresh RangeIndex after the
# merges above).
daysactive_dummies = (
    pd.get_dummies(day_labels, dtype=int)
    .add_prefix('days_active_')
    .reindex(series.index, fill_value=0)
)

daysactive_cols = daysactive_dummies.columns

# Drop indicator columns left by a previous run so the join does not fail with
# "columns overlap" when the cell is re-executed.
user_info_df_shortened = (
    user_info_df_shortened
    .drop(columns=daysactive_cols, errors='ignore')
    .join(daysactive_dummies)
)

In [0]:
user_info_df_shortened

##### Test

In [0]:
chi_square_results_days_active = perform_chi_square_tests(
    user_info_df_shortened,
    feature_set=daysactive_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_days_active.reset_index())

In [0]:
chi_days_active_table = chi_square_results_days_active.reset_index()

In [0]:
# Parse the day count out of the feature name (e.g. "days_active_7" -> 7) as a
# number, so that sort_values('days') orders numerically rather than
# lexicographically ("10" < "2") and the OLS trendline gets a numeric x axis.
# Suffixes that are not numbers (e.g. "nan") become NaN.
chi_days_active_table['days'] = pd.to_numeric(
    chi_days_active_table['feature'].str.split('_').str[-1],
    errors='coerce',
)

In [0]:
chi_days_active_table[['days', 'chi2']]

In [0]:
chi_days_active_table[chi_days_active_table['significant']].sort_values('days')

In [0]:
fig = px.scatter(
    chi_days_active_table[chi_days_active_table['significant']].sort_values('days'),
    x="days",
    y="chi2",
    trendline="ols"
)

fig.update_layout(
    xaxis_title="Days",
    yaxis_title="Chi-square statistic"
)

fig.show()

not directional but shows magnitude of differences
 

Huge spike at 6 and 7 days; low impact at 1 day, with 62 days showing the smallest impact. This suggests a danger zone at around 1 week and at around 2.5 months.

In [0]:
# redundant due to specifically for continuous data. useful for respondent features further down.

# mannwhitney_results_days_active = perform_mannwhitney_tests(
#     user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_days_active.reset_index())

In [0]:
ztest_results_days_active = perform_two_proportion_z_tests(
    user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
)
display(ztest_results_days_active[ztest_results_days_active['significant']].reset_index())

Sorting by days active (days from first task to task complete) no clear pattern but 6 & 7 days look to be more risky -> users hitting 1 week mark potentially less risky with

also potential cyclical nature of risks wonky behaviour also shown in users hitting the 1 month mark.

safest zone is 1 day mark and 0 day mark.

higher numbers from 50+ tend to reflect safer zones too indicating better behaviour deep into tenure.

potential consideration >> initial engagement is good, 1 week risk, followed by 1 month-1.5 month cyclical risk >> after 2.5 months a little safer

deprioritize users exactly at 6, 7 and 48 days. prioritise 0 and 1 dayers and investigate 7 specifically as that stands out.

In [0]:
ztest_days_active_table = ztest_results_days_active[ztest_results_days_active['significant']].reset_index()

# Parse the numeric suffix of the feature name tolerantly: "7" and "7.0" both
# become 7, and a non-numeric suffix such as "nan" is dropped instead of raising
# ValueError the way a direct astype(int) would.
day_numbers = pd.to_numeric(
    ztest_days_active_table['feature'].str.split('_').str[-1],
    errors='coerce',
)
ztest_days_active_table = (
    ztest_days_active_table
    .assign(days=day_numbers)
    .dropna(subset=['days'])
    .astype({'days': int})
    .sort_values('days')
)
# Back to strings so the bar chart treats days as ordered categories.
ztest_days_active_table['days'] = ztest_days_active_table['days'].astype(str)

In [0]:
df = ztest_days_active_table[['days', 'z_statistic', 'proportion_diff']]

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x=df["days"],
        y=df["z_statistic"],
        name="z statistic",
        marker_color="steelblue",
        opacity=0.7,
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=df["days"],
        y=df["proportion_diff"],
        name="proportion diff",
        mode="lines+markers",
        line=dict(color="indianred", width=2),
    ),
    secondary_y=True,
)

fig.update_layout(
    xaxis_title="Days since first task_complete",
)

fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff (%pp)", secondary_y=True)

fig.show()

In [0]:
main_features += list(daysactive_cols)
main_features += ['days_active_before_task']

#### Temporal Feature Analysis & Breakdowns - STRONG HYPOTHESIS

Analyzing temporal patterns to identify differences between wonky and non-wonky study tasks.


In [0]:
user_info_df['wonky_task_instances'].unique()

In [0]:
# Create time features using modular function
user_info_df = create_time_features(user_info_df, date_col="date_completed")

print(f"Night tasks: {user_info_df['is_night'].mean()*100:.1f}%")
print(f"Weekend tasks: {user_info_df['is_weekend'].mean()*100:.1f}%")

In [0]:
temporal_features = [
    "is_weekend",
    "is_night",
    "is_business_hour",
    "is_business_hour_weekday",
    "is_business_hour_weekend",
    "is_monday",
    "is_tuesday",
    "is_wednesday",
    "is_thursday",
    "is_friday",
    "is_saturday",
    "is_sunday",
]

print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=temporal_features,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

**Task complete time - definitely a good gauge. The majority takes place during business hours, relatively evenly spread across the work week. The LARGEST delta where wonky is more prevalent is in business hours, suggesting professional behaviours.**

##### Tests for Temporal Features

Testing independence between temporal features and wonky study participation.
Chi-squared test determines if temporal patterns differ significantly between wonky and non-wonky groups.


In [0]:
sorted(user_info_df.columns)

In [0]:
user_info_df[temporal_features + ['wonky_study_count']]

In [0]:
chi_square_results_temporal_features = perform_chi_square_tests(
    user_info_df,
    feature_set=temporal_features,
    group_var='wonky_study_count',
    significance_level=0.01
)
display(chi_square_results_temporal_features.reset_index())

In [0]:
# mannwhitney_result_temporal_feature = perform_mannwhitney_tests(user_info_df, temporal_features, group_var='wonky_study_count')
# display(mannwhitney_result_temporal_feature.reset_index())

In [0]:
# welch_results_temporal_feature = perform_welch_ttests(user_info_df, temporal_features, group_var='wonky_study_count')
# display(welch_results_temporal_feature.reset_index())

In [0]:
ztest_results_temporal_feature = perform_two_proportion_z_tests(user_info_df, temporal_features, group_var='wonky_study_count')
display(ztest_results_temporal_feature[ztest_results_temporal_feature['significant']].reset_index())

In [0]:
df = ztest_results_temporal_feature.reset_index()

In [0]:
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x=df["feature"],
        y=df["z_statistic"],
        name="z statistic",
        marker_color="steelblue",
        opacity=0.8,
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=df["feature"],
        y=df["proportion_diff"],
        name="proportion diff",
        mode="lines+markers",
        line=dict(color="indianred", width=2),
    ),
    secondary_y=True,
)

fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="z statistic",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()



going with chi squared and z test due to binary outcome

chi is good for magnitude (just association) and z test give directionality 

chi2 - strong association across all but biggest magnitude is friday and business hours; weekend and mid week less of a signal >> business hours x [friday, monday, thursday] could be indicative 

ztest - friday (huge effect), is_business_hour and is_business_hour_weekday have the largest impact for wonky studies. Lower-risk times are Monday, Thursday and night.
- no strong signals from weekends and mid week (tuesday wednesday)

all pretty solid (largely due to large sample)

potential end of week rush effect for friday (earning targets, fatigue or rushing for beers)

friday shows 8pp difference pro wonky
business hours show 5-6pp difference pro wonky
- multi tasking with jobs or potentially using work devices (to investigate)
- monday, thursday dampen business hour effects.

business hour effect likely driven by friday effect

night time effect supported by chi2 and ztest 2.3pp difference

takeaway:
users start week with higher focus
mid week is pretty okay (nothing standing out too much)

may be an engagement & rushing problem with friday being main  culprit

things to consider: selection bias -> users complete surveys on these days at varying levels of engagement 
- responsible work focused people might do it on monday as a routine
- casual or rush people tend to do this on fridays
- therefore we would observe Monday as safer and Friday as riskier

In [0]:
import numpy as np
import pandas as pd
from scipy import stats

def analyze_selection_bias(df, day_safe_col='is_monday', day_risky_col='is_friday', min_tasks=5, wonky_col='wonky_study_count', alpha=0.05, respondent_col='respondentPk'):
    """
    Analyzes within-respondent variation to detect selection bias vs real day effects.

    For every respondent with at least ``min_tasks`` tasks on BOTH day types, the
    share of tasks with ``wonky_col > 0`` is computed per day type and the two
    shares are compared across respondents with a paired t-test.

    Args:
        df: Task-level DataFrame (e.g. user_info_df). It is not modified.
        day_safe_col: The column name for the "Safe" day (e.g., 'is_monday')
        day_risky_col: The column name for the "Risky" day (e.g., 'is_friday')
        min_tasks: Minimum tasks PER DAY TYPE required to be included (default 5)
        wonky_col: The target variable (e.g., 'wonky_study_count'). A task counts
            as wonky when the value is > 0; missing values count as not wonky.
        alpha: Significance level used for the printed verdict (default 0.05).
        respondent_col: Column identifying the respondent (default 'respondentPk').

    Returns:
        DataFrame with one row per qualifying respondent (per-day wonky rates,
        task counts and ``risk_diff`` = risky-day rate minus safe-day rate), or
        None when fewer than 10 respondents qualify.

    Notes:
        If a row has both day flags set to 1 it is counted as a safe day, because
        the safe condition is evaluated first.
    """

    print(f"--- Selection Bias Analysis: {day_safe_col} vs {day_risky_col} ---")

    # 1. Prepare working data: copy only the columns that are needed, so the
    #    caller's frame is untouched and the copy stays small.
    needed_cols = list(dict.fromkeys([respondent_col, day_safe_col, day_risky_col, wonky_col]))
    work_df = df[needed_cols].copy()

    # Define day type for aggregation
    conditions = [
        work_df[day_safe_col] == 1,
        work_df[day_risky_col] == 1
    ]
    choices = ['safe_day', 'risky_day']
    work_df['analysis_day_type'] = np.select(conditions, choices, default='other')

    # Filter to only relevant rows (explicit copy so the assignment below does
    # not write into a slice of the previous frame).
    work_df = work_df[work_df['analysis_day_type'] != 'other'].copy()

    # Create binary flag for rate calculation (did this task have ANY wonkiness?)
    work_df['is_wonky_event'] = (work_df[wonky_col] > 0).astype(int)

    if work_df.empty:
        print(f"Found 0 respondents with >= {min_tasks} tasks on BOTH days.")
        print("Not enough users for robust statistical inference. Try lowering 'min_tasks'.")
        return None

    # 2. Group by Respondent -> Calculate Rates
    # Count total tasks and sum the wonky events per day-type per user.
    respondent_stats = work_df.groupby([respondent_col, 'analysis_day_type']).agg(
        total_tasks=('is_wonky_event', 'count'),
        wonky_events=('is_wonky_event', 'sum')
    ).reset_index()

    respondent_stats['wonky_rate'] = respondent_stats['wonky_events'] / respondent_stats['total_tasks']

    # 3. Pivot to put both day types side-by-side for each user
    user_pivot = respondent_stats.pivot(
        index=respondent_col,
        columns='analysis_day_type',
        values=['wonky_rate', 'total_tasks']
    )

    # Flatten column names (e.g., 'wonky_rate_risky_day', 'total_tasks_safe_day')
    user_pivot.columns = [f'{col[0]}_{col[1]}' for col in user_pivot.columns]

    # If one day type has no rows at all, its columns are absent after the pivot;
    # add them as NaN so the filter below excludes everyone instead of raising KeyError.
    expected_cols = [
        'wonky_rate_risky_day', 'wonky_rate_safe_day',
        'total_tasks_risky_day', 'total_tasks_safe_day',
    ]
    user_pivot = user_pivot.reindex(columns=expected_cols).reset_index()

    # 4. Filter for Multi-Day Respondents (The "Control Group")
    # Users must have enough volume on BOTH days to be statistically useful.
    # Respondents missing a day type have NaN counts and fail the comparison.
    valid_users = user_pivot[
        (user_pivot['total_tasks_safe_day'] >= min_tasks) &
        (user_pivot['total_tasks_risky_day'] >= min_tasks)
    ].copy()

    n_users = len(valid_users)
    print(f"Found {n_users} respondents with >= {min_tasks} tasks on BOTH days.")

    if n_users < 10:
        print("Not enough users for robust statistical inference. Try lowering 'min_tasks'.")
        return None

    # 5. Calculate Within-Person Difference
    # Positive Diff means the risky day has the higher wonky rate for the same person
    valid_users['risk_diff'] = valid_users['wonky_rate_risky_day'] - valid_users['wonky_rate_safe_day']

    mean_safe_rate = valid_users['wonky_rate_safe_day'].mean()
    mean_risky_rate = valid_users['wonky_rate_risky_day'].mean()
    mean_diff = valid_users['risk_diff'].mean()

    # Paired T-Test (Is the difference statistically significant?)
    t_stat, p_val = stats.ttest_rel(valid_users['wonky_rate_risky_day'], valid_users['wonky_rate_safe_day'])

    print(f"\nRESULTS:")
    print(f"1. {day_safe_col} Average Wonky Rate: {mean_safe_rate:.2%}")
    print(f"2. {day_risky_col} Average Wonky Rate: {mean_risky_rate:.2%}")
    print(f"3. Mean Difference (Effect Size): {mean_diff:+.2%} pts")
    print(f"4. Significance (p-value): {p_val:.5f}")

    if np.isnan(p_val):
        # Happens when every respondent has exactly the same difference
        # (zero variance), so the test statistic is undefined.
        print("INCONCLUSIVE: The paired t-test is undefined (no variation in within-respondent differences).")
    elif p_val < alpha:
        print("SIGNIFICANT: The day effect is REAL. The same user performs differently on these days.")
    else:
        # Failing to reject the null does not prove selection bias; it only means
        # no within-respondent effect was detected at this significance level.
        print("NOT SIGNIFICANT: No within-respondent day effect detected. The pooled variation may be due to WHO is working, not WHEN.")

    return valid_users

# --- EXECUTE ---
selection_bias_df = analyze_selection_bias(user_info_df, day_safe_col='is_monday', day_risky_col='is_friday', min_tasks=5)


controlling for the selection bias and looking at average wonky rates for friday being almost double monday! [using paired t test] suggesting the day effect might be real

In [0]:
main_features += list(temporal_features)

STRONG SIGNIFICANT READ AT 99% LEVEL LARGEST MAGNITUDE FOUND AT NIGHT. LOWEST MAGNITUDE DURING WEEKEND

In [0]:
# # Visualize chi-squared statistics
# if len(chi_square_results) > 0:
#     fig_chi2 = create_chi_squared_bar_chart(
#         chi_square_results,
#         chi2_col='chi2',
#         p_value_col='chi_p_value',
#         significance_level=0.01,
#         title="Chi-Squared Statistic by Temporal Feature"
#     )
#     fig_chi2.show()
# else:
#     print("No chi-squared test results available for visualization")


In [0]:
# if len(chi_square_results) > 0:
#     delta_results = calculate_temporal_feature_deltas(
#       user_info_df,
#       temporal_features=temporal_features,
#       group_col='wonky_task_instances',  
#       group_threshold=0 
# )
    
#     if len(delta_results) > 0:
#         fig_dual = create_chi_squared_delta_dual_axis_chart(
#             chi_square_results,
#             delta_results,
#             chi2_col='chi2',
#             p_value_col='chi_p_value',
#             delta_col='delta_pct',
#             significance_level=0.01,
#             title="Chi-Squared Statistic and Delta % by Temporal Feature"
#         )
#         fig_dual.show()
#     else:
#         print("No delta results available for visualization")
# else:
#     print("No chi-squared test results available for visualization")

**The bar is the level of significance between wonky and non-wonky; the line is the delta between wonky and non-wonky in terms of when tasks are completed.**

**A positive delta means wonky participants are more prevalent and a negative delta means they are less prevalent.**

**Business hours, night time and Saturdays look like the overall best separators between wonky and non-wonky participants in terms of task completion time.**

#### Task speed features - OKAY HYPOTHESIS

In [0]:
# capping because of very anomalous time throwing off the average
# clip() caps values above the 99.99th percentile and leaves missing durations as
# NaN; the np.where(x < cap, x, cap) form replaced NaN with the cap, because
# NaN < cap is False.
task_time_cap = user_info_df['task_time_taken_s'].quantile(0.9999)
user_info_df['task_time_taken_s_capped'] = user_info_df['task_time_taken_s'].clip(upper=task_time_cap)

In [0]:
sorted(user_info_df.columns)

In [0]:
[col for col in user_info_df.columns if 'length' in col]

In [0]:
user_info_df['task_time_taken_s_capped'].max()

In [0]:
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s_capped",
    use_std_dev=True,
    group_by_col="task_length_of_task",
    min_group_size=5    
)

mean_time = user_info_df["task_time_taken_s_capped"].mean()
std_time = user_info_df["task_time_taken_s_capped"].std()
print(f"Task time statistics:")
print(f"  Mean: {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev: {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
print(f"  Fast threshold (mean - 1σ): {mean_time - std_time:.2f}s")
print(f"  Suspiciously fast threshold (mean - 2σ): {mean_time - 2*std_time:.2f}s")
print(f"  Slow threshold (mean + 1σ): {mean_time + std_time:.2f}s")
print(f"  Suspiciously slow threshold (mean + 2σ): {mean_time + 2*std_time:.2f}s")
print()

# Display breakdown with wonky vs non-wonky comparison
print(create_task_speed_breakdown_summary(
    user_info_df,
    group_col='wonky_task_instances',
    group_threshold=0
))

**Wonky participants are usually suspiciously fast to normal; non-wonky participants tend to be normal to suspiciously slow in terms of delta.**

In [0]:
# NOTE(review): this is a second call to create_task_speed_features, this time on
# the uncapped 'task_time_taken_s' column and without group_by_col, and it
# reassigns user_info_df. If the helper writes the same speed-flag columns as the
# earlier call on 'task_time_taken_s_capped', those flags are presumably
# overwritten here - confirm which version the tests below are meant to use.
# NOTE(review): use_std_dev=True is passed while the thresholds printed below are
# percentiles - confirm the printed thresholds match what the helper computes.
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    use_std_dev=True
)

fast_threshold = user_info_df["task_time_taken_s"].quantile(0.16)
suspiciously_fast_threshold = user_info_df["task_time_taken_s"].quantile(0.025)
slow_threshold = user_info_df["task_time_taken_s"].quantile(0.84)
suspiciously_slow_threshold = user_info_df["task_time_taken_s"].quantile(0.975)

# Also calculate trimmed mean/std for reference (trimming extreme outliers)
trimmed_data = user_info_df["task_time_taken_s"].clip(
    lower=user_info_df["task_time_taken_s"].quantile(0.01),
    upper=user_info_df["task_time_taken_s"].quantile(0.99)
)
mean_time = trimmed_data.mean()
std_time = trimmed_data.std()

print(f"Task time statistics (using percentiles, robust to outliers):")
print(f"  Mean (trimmed 1%-99%): {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev (trimmed 1%-99%): {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
print(f"  Fast threshold (16th percentile): {fast_threshold:.2f}s ({fast_threshold/60:.2f} min)")
print(f"  Suspiciously fast threshold (2.5th percentile): {suspiciously_fast_threshold:.2f}s ({suspiciously_fast_threshold/60:.2f} min)")
print(f"  Slow threshold (84th percentile): {slow_threshold:.2f}s ({slow_threshold/60:.2f} min)")
print(f"  Suspiciously slow threshold (97.5th percentile): {suspiciously_slow_threshold:.2f}s ({suspiciously_slow_threshold/60:.2f} min)")
print()

# Display breakdown with wonky vs non-wonky comparison
group_col_to_use = 'wonky_task_instances' if 'wonky_task_instances' in user_info_df.columns else 'wonky_study_count'
print(create_task_speed_breakdown_summary(
    user_info_df,
    group_col=group_col_to_use,
    group_threshold=0
))

##### Tests

In [0]:
speed_features = ['is_suspiciously_fast', 'is_fast', 'is_normal_speed', 'is_slow', 'is_suspiciously_slow']

In [0]:
user_info_df[speed_features + ['wonky_study_count']]

In [0]:
chi_square_results_task_speeds = perform_chi_square_tests(
    user_info_df,
    feature_set=speed_features,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_task_speeds.reset_index())

In [0]:
# mannwhitney_results_task_speeds = perform_mannwhitney_tests(
#     user_info_df, speed_features, group_var="wonky_study_count"
# )
# display(mannwhitney_results_task_speeds.reset_index())

In [0]:
# welch_results_task_speeds = perform_welch_ttests(
#     user_info_df, speed_features, group_var="wonky_study_count"
# )
# display(welch_results_task_speeds.reset_index())

In [0]:
ztest_results_task_speeds = perform_two_proportion_z_tests(
    user_info_df, speed_features, group_var="wonky_study_count"
)
display(ztest_results_task_speeds.reset_index())

In [0]:
main_features += speed_features

normal speed has highest magnitude and positive direction in z test -> counter intuitive -> problem could be comprehension rather than rushing/gaming

fast defined as -> 1 standard deviation faster than the average time of the group

fast seems relative safe and slow is safe suggesting might not be a speed thing and more comprehension issue


TODO - calibrate speeds to account for points or survey types (unsure which is best indicator - ask tim or dan)

TODO -> Come up with good Viz across test

#### Task and Point category - WEAK HYPOTHESIS

In [0]:
# user_info_df['defined_task_category'] = user_info_df['taskCategory'].astype(str) + "_points_" + user_info_df['payoutPoints'].astype(str)

# # Import the function
# from eda.statistical_tests import compare_task_category_wonky_rates

# # Create the summary table directly from user_info_df
# # Why: Analyzes wonky rates by task category to identify which categories have higher fraud rates
# category_wonky_summary = compare_task_category_wonky_rates(
#     df=user_info_df,
#     category_col="defined_task_category",
#     respondent_id_col="respondentPk",
#     wonky_col="wonky_study_count"
# )

# # Display the results
# print("Wonky Rates by Task Category:")
# print("=" * 100)
# display(category_wonky_summary)

# # Format for better readability
# category_wonky_summary_formatted = category_wonky_summary.copy()
# category_wonky_summary_formatted['wonky_pct'] = category_wonky_summary_formatted['wonky_pct'].round(2)
# category_wonky_summary_formatted['non_wonky_pct'] = category_wonky_summary_formatted['non_wonky_pct'].round(2)
# category_wonky_summary_formatted['proportion_delta'] = category_wonky_summary_formatted['proportion_delta'].round(2)

# display(category_wonky_summary_formatted)

#### Device

In [0]:
series = user_info_df['hardware_version']

# One-hot encode hardware_version: explode() gives each element of a list-like
# cell its own row (scalar cells pass through unchanged), labels are stripped of
# surrounding whitespace, and groupby(level=0).sum() adds the indicator rows back
# up per original index label.
hardware_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

hardware_cols = hardware_dummies.columns

# NOTE(review): join() aligns on the index and raises ValueError if any dummy
# column name already exists in user_info_df, so this cell fails when re-run.
user_info_df = user_info_df.join(hardware_dummies)

In [0]:
hardware_cols

In [0]:
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=hardware_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

huge delta for desktop, but this might be a sample thing. may converge better with 3-6 months of data.

###### Tests

In [0]:
user_info_df[list(hardware_cols) + ['wonky_study_count']]

In [0]:
chi_square_results_hardware = perform_chi_square_tests(
    user_info_df,
    feature_set=hardware_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_hardware.reset_index())

In [0]:
# mannwhitney_results_hardware = perform_mannwhitney_tests(
#     user_info_df, hardware_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_hardware.reset_index())

In [0]:
# welch_results_hardware = perform_welch_ttests(
#     user_info_df, hardware_cols, group_var="wonky_study_count"
# )
# display(welch_results_hardware.reset_index())

In [0]:
ztest_results_hardware = perform_two_proportion_z_tests(
    user_info_df, hardware_cols, group_var="wonky_study_count"
)
display(ztest_results_hardware.reset_index())

High signal in desktops and iPhones; others are more negligible. Largely driven by the volume of general participants, but it seems like this needs to be included in the model.

Something to be investigated in the desktop process, especially as iPhone users usually tend to be high quality.

In [0]:
main_features += ['Desktop', 'Iphone']

#### Platform

In [0]:
series = user_info_df['platform_name']

# One-hot encode platform_name: explode() gives each element of a list-like cell
# its own row (scalar cells pass through unchanged), labels are stripped of
# surrounding whitespace, and groupby(level=0).sum() adds the indicator rows back
# up per original index label.
platform_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

platform_cols = platform_dummies.columns

# NOTE(review): join() raises ValueError if any dummy column name already exists
# in user_info_df (re-running this cell, or a label shared with an earlier
# one-hot block, since these columns carry no prefix).
user_info_df = user_info_df.join(platform_dummies)

In [0]:
platform_cols

In [0]:
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=platform_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

##### Tests

In [0]:
user_info_df[list(platform_cols) + ['wonky_study_count']]

In [0]:
chi_square_results_platform = perform_chi_square_tests(
    user_info_df,
    feature_set=platform_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_platform.reset_index())

In [0]:
# mannwhitney_results_platform = perform_mannwhitney_tests(
#     user_info_df, platform_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_platform.reset_index())

In [0]:
# welch_results_platform = perform_welch_ttests(
#     user_info_df, platform_cols, group_var="wonky_study_count"
# )
# display(welch_results_platform.reset_index())

In [0]:
ztest_results_platform = perform_two_proportion_z_tests(
    user_info_df, platform_cols, group_var="wonky_study_count"
)
display(ztest_results_platform.reset_index())

In [0]:
user_info_df[['Linux', 'iOS', 'Unknown', 'Mac OS X', 'Android', 'Windows']].sum() / len(user_info_df)

Similarly to desktop, there is a big magnitude for Linux and iOS, but this could just be due to volumes.

Linux is a platform not used by most people, though >> needs a follow-up question
- sometimes used a lot by bot farmers etc.

In [0]:
main_features += ['Linux', 'iOS']

### Demographic stuff

#### Gambling

In [0]:
series = user_info_df['gambling_participation_mc']

# One-hot encode each gambling mode: explode() gives each element of a list-like
# cell its own row, labels are stripped of surrounding whitespace, and
# groupby(level=0).sum() adds the indicator rows back up per original index label.
# NOTE(review): presumably this column holds a list of modes per row - confirm.
gambling_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

gambling_cols = gambling_dummies.columns

# NOTE(review): join() raises ValueError if any dummy column name already exists
# in user_info_df, so this cell fails when re-run.
user_info_df = user_info_df.join(gambling_dummies)

In [0]:
gambling_cols

In [0]:
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=gambling_cols,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

###### Tests

In [0]:
user_info_df[list(gambling_cols) + ['wonky_study_count']]

In [0]:
chi_square_results_gambling = perform_chi_square_tests(
    user_info_df,
    feature_set=gambling_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_gambling.reset_index())

In [0]:
# mannwhitney_results_gambling = perform_mannwhitney_tests(
#     user_info_df, gambling_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_gambling.reset_index())

In [0]:

# welch_results_gambling = perform_welch_ttests(
#     user_info_df, gambling_cols, group_var="wonky_study_count"
# )
# display(welch_results_gambling.reset_index())

In [0]:
ztest_results_gambling = perform_two_proportion_z_tests(
    user_info_df, gambling_cols, group_var="wonky_study_count"
)
display(ztest_results_gambling.reset_index())

counter to the expected results almost every form of gambling associated with lower risk to wonky studies suggests less of a gaming issue again and more of a comprehension issue.

maybe only gamblers are actually better at reading and understanding what they're doing online (attention to detail etc) resulting in less wonky studies

In [0]:
main_features += list(gambling_cols)

#### Income

In [0]:
income_map = {
    "A": "Less than £15,000",
    "B": "£15,000 to £19,999",
    "C": "£20,000 to £24,999",
    "D": "£25,000 to £29,999",
    "E": "£30,000 to £34,999",
    "F": "£35,000 to £39,999",
    "G": "£40,000 to £44,999",
    "H": "£45,000 to £49,999",
    "I": "£50,000 to £59,999",
    "J": "£60,000 to £74,999",
    "K": "£75,000 to £84,999",
    "L": "£85,000 to £99,999",
    "M": "£100,000 to £124,999",
    "N": "£125,000 to £149,999",
    "O": "£150,000 to £174,999",
    "P": "£175,000 to £199,999",
    "Q": "£200,000 and above",
    "R": "Prefer not to answer",
}

user_info_df["fulcrum_household_income_mapped"] = (
    user_info_df["fulcrum_household_income"].map(income_map)
)

user_info_df["fulcrum_household_income_mapped"].value_counts()/len(user_info_df)

In [0]:
(user_info_df["fulcrum_household_income_mapped"].value_counts()/len(user_info_df)).T

In [0]:
series = user_info_df['fulcrum_household_income_mapped']

# One-hot encode each mapped household income band
income_dummies = (
    series.explode()                    
     .str.strip()                   
     .pipe(pd.get_dummies)          
     .groupby(level=0).sum()     
)

income_cols = income_dummies.columns

# NOTE(review): join() raises ValueError if any dummy column name already exists
# in user_info_df, so this cell fails when re-run.
user_info_df = user_info_df.join(income_dummies)

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0]['fulcrum_household_income_mapped'].value_counts()/len(user_info_df[user_info_df['wonky_study_count'] > 0])

In [0]:
user_info_df[list(income_cols) + ['wonky_study_count']]

most wonky studies are in lower income groups

##### Test

In [0]:
chi_square_results_income = perform_chi_square_tests(
    user_info_df,
    feature_set=income_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_income.reset_index())

In [0]:
# mannwhitney_results_income = perform_mannwhitney_tests(
#     user_info_df, income_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_income.reset_index())

In [0]:
# welch_results_income = perform_welch_ttests(
#     user_info_df, income_cols, group_var="wonky_study_count"
# )
# display(welch_results_income.reset_index())

In [0]:
ztest_results_income = perform_two_proportion_z_tests(
    user_info_df, income_cols, group_var="wonky_study_count"
)
display(ztest_results_income.reset_index())

In [0]:
# Dual-axis view of the income z-test results: bars show the z statistic (left
# axis), the line shows the difference in proportions (right axis).
# `go` and `make_subplots` come from the notebook's import cell, so they are not
# re-imported here.
ztest_income_plot_df = ztest_results_income.reset_index()

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Bar(
        x=ztest_income_plot_df["feature"],
        y=ztest_income_plot_df["z_statistic"],
        name="z statistic",
        marker_color="steelblue",
        opacity=0.8,
    ),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(
        x=ztest_income_plot_df["feature"],
        y=ztest_income_plot_df["proportion_diff"],
        name="proportion diff",
        mode="lines+markers",
        line=dict(color="indianred", width=2),
    ),
    secondary_y=True,
)

fig.update_layout(
    title="Household income bands: two-proportion z-test by wonky_study_count",
    xaxis_title="Feature",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)

# Axis titles are set per y-axis so the primary and secondary labels stay distinct.
fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()


In [0]:
# Register the income indicator columns as features. Only names not already in
# the list are appended, so re-running this cell cannot create duplicates.
main_features += [col for col in income_cols if col not in main_features]

In [0]:
# Number of feature names collected so far.
len(main_features)

In [0]:
# NOTE: a bare `break` used to sit in this cell as a manual stop. Outside a loop
# it is a SyntaxError, which aborts "Run All" before the sections below
# (including the cells that build and save respondent_features) can execute.
# It has been removed so the notebook runs top to bottom.

#### Income Gender

In [0]:
series = user_info_df['gender']

# One-hot encode gender: one indicator column per distinct value, prefixed with
# 'gender_'. explode() only has an effect when a cell holds a list of values;
# groupby(level=0).sum() collapses back to one row per index label.
gender_dummies = (
    series.explode()
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()
     .add_prefix('gender_')
)

gender_cols = gender_dummies.columns

# Remove indicator columns left behind by a previous run of this cell before
# joining; without this, re-executing the cell fails on overlapping column names.
user_info_df = user_info_df.drop(columns=gender_cols, errors="ignore").join(gender_dummies)

In [0]:
# Register the gender indicator columns as features. Only names not already in
# the list are appended, so re-running this cell cannot create duplicates.
main_features += [col for col in gender_cols if col not in main_features]
main_features

In [0]:
# Combined "<gender>_<income band>" key. Both parts are cast to str first, so a
# missing value becomes part of the key text instead of being left empty.
user_info_df['gender_fulcrum_household_income_mapped'] = (
    user_info_df['gender'].astype(str)
    + "_"
    + user_info_df['fulcrum_household_income_mapped'].astype(str)
)

In [0]:
series = user_info_df['gender_fulcrum_household_income_mapped']

# One-hot encode the combined gender x income-band key: one indicator column per
# combination. explode() only has an effect when a cell holds a list of values;
# groupby(level=0).sum() collapses back to one row per index label.
gender_income_dummies = (
    series.explode()
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()
)

gender_income_cols = gender_income_dummies.columns

# Remove indicator columns left behind by a previous run of this cell before
# joining; without this, re-executing the cell fails on overlapping column names.
user_info_df = (
    user_info_df
    .drop(columns=gender_income_cols, errors="ignore")
    .join(gender_income_dummies)
)

In [0]:
# Row counts per gender x income-band combination, across all rows.
user_info_df["gender_fulcrum_household_income_mapped"].value_counts()

In [0]:
# Row counts per gender x income-band combination, restricted to rows with at
# least one wonky study.
user_info_df.loc[
    user_info_df['wonky_study_count'] > 0,
    'gender_fulcrum_household_income_mapped',
].value_counts()

In [0]:
# Gender x income indicator columns shown next to the wonky-study count.
user_info_df[[*gender_income_cols, 'wonky_study_count']]

Strong cut-off category: females with an income of less than £15k.

##### Tests

In [0]:
# Chi-square tests for each gender x income indicator column against
# wonky_study_count, using the project helper with a 0.01 significance level.
# NOTE(review): wonky_study_count is a count, not a binary flag — confirm how
# perform_chi_square_tests turns it into groups.
chi_square_results_income_gender = perform_chi_square_tests(
    user_info_df,
    feature_set=gender_income_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_income_gender.reset_index())

In [0]:
# mannwhitney_results_income_gender = perform_mannwhitney_tests(
#     user_info_df, gender_income_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_income_gender.reset_index())

In [0]:
# welch_results_income_gender = perform_welch_ttests(
#     user_info_df, gender_income_cols, group_var="wonky_study_count"
# )
# display(welch_results_income_gender.reset_index())

In [0]:
# Two-proportion z-tests for each gender x income indicator column, grouped by
# wonky_study_count (project helper).
ztest_results_income_gender = perform_two_proportion_z_tests(
    user_info_df, gender_income_cols, group_var="wonky_study_count"
)
display(ztest_results_income_gender.reset_index())

The aim is to determine whether there is a socio-economic and gender divide in survey quality.

Lower incomes seem to carry significantly higher risk (again, possibly an issue with comprehension or difficulty understanding the task).

The biggest safe zones are incomes of 100k+
(200k has the lowest risk, at -5.1pp), with strong significance.

High income correlates with less stress about money, better-quality answers, tech literacy, and motivations other than money for filling in surveys.

highest risk in the 25k-30k category (+2.8pp)

High-earning males are statistically the safest demographic -> tech-literate and professional demographic

riskiest cohorts:
- females 45k-50k (+1.8pp)

safe cohorts (both genders 100k+)
less safe (males < 100k, females 25k-50k)
high risk (females 25k-50k)

professionals with high income correlate with high quality.

low income shouldn't be blocked but managed >> consider other tests or qualifiers for comprehension ability

OVERALL FROM BASIC STAT TESTS KEY TAKE AWAYS

Days Active: Day 7 is critical risk.

Temporal: Friday & Business Hours are risky; Nights are safe.

Speed: Normal speed is paradoxically risky; Fast is safe.

Device/Platform: Desktop/Linux/iPhone are critical risks.

Gambling: Gamblers are safe/smart users.

Demographics: High earners are safe.

#### Exposure bands

In [0]:
# Share of wonky_respondent_df rows in each exposure band.
wonky_respondent_df['exposure_band'].value_counts().div(len(wonky_respondent_df))

In [0]:
# Display the current respondent_features frame.
# NOTE(review): respondent_features is (re)built by create_respondent_behavioral_features
# further down — confirm it is already defined at this point on a fresh kernel.
respondent_features

### Respondent_Features

In [0]:
# Label rows with a missing exposure_band as 'unknown' so they stay visible in
# the distribution computed in the next cell.
user_info_df['exposure_band'] = user_info_df['exposure_band'].fillna('unknown')

In [0]:
# Share of user_info_df rows in each exposure band.
user_info_df['exposure_band'].value_counts().div(len(user_info_df))

In [0]:
# Build one row of behavioural features per respondent.
import importlib

# If the feature-engineering module is already loaded, reload it and re-bind the
# helper so edits made to the module since the first import are picked up.
if "eda.feature_engineering" in sys.modules:
    importlib.reload(sys.modules["eda.feature_engineering"])
    from eda.feature_engineering import create_respondent_behavioral_features

# Settings forwarded to the helper, all taken from the feature-engineering YAML.
behavioral_feature_config = {
    **{
        threshold_key: feature_config["volume_thresholds"][threshold_key]
        for threshold_key in ("high_volume_percentile", "extreme_volume_percentile")
    },
    "velocity_bins": feature_config["velocity_bins"],
    "velocity_labels": feature_config["velocity_labels"],
}

respondent_features = create_respondent_behavioral_features(
    user_info_df,
    respondent_id_col="respondentPk",
    date_col="date_completed",
    config=behavioral_feature_config,
    categorical_cols=main_features,
)

n_respondents = respondent_features.shape[0]
print(f"Aggregated to {n_respondents:,} respondents")
avg_tasks_per_respondent = respondent_features['total_tasks'].mean()
print(f"Avg tasks per respondent: {avg_tasks_per_respondent:.2f}")

In [0]:
# Whole days between each respondent's first and last task.
respondent_features['days_active_all_tasks'] = (
    respondent_features['last_task_date']
    .sub(respondent_features['first_task_date'])
    .dt.days
)

In [0]:
# Add wonky features using modular function
respondent_features = add_wonky_features(
    respondent_features, wonky_counts, respondent_id_col="respondentPk"
)

print(f"Wonky features added")
print(
    f"Respondents with wonky tasks: {(respondent_features['wonky_task_ratio'] > 0).sum():,}"
)
print(
    f"High wonky concentration (>50%): {respondent_features['is_high_wonky'].sum():,}"
)

In [0]:
# Show the respondent-level frame ordered by respondent id (display only; the
# sorted result is not assigned back).
respondent_features.sort_values('respondentPk')

In [0]:
# Wonky-related columns that the following cell removes from respondent_features.
cols_to_drop = ['wonky_study_flag', 'wonky_task_instances', 'wonky_unique_tasks', 'total_wonky_studies']

In [0]:
# Drop the columns listed in cols_to_drop. errors="ignore" keeps the cell
# re-runnable: on a second execution the columns are already gone and a strict
# drop would raise KeyError.
respondent_features = respondent_features.drop(columns=cols_to_drop, errors="ignore")

In [0]:
# Column dtypes and non-null counts after the drop above.
respondent_features.info()

In [0]:
# Recompute wonky_task_ratio as wonky studies per task, overwriting the existing column.
respondent_features['wonky_task_ratio'] = respondent_features['wonky_study_count'].div(
    respondent_features['total_tasks']
)

In [0]:
# Widen every uint8 column to int64, one column at a time.
unit8_cols = respondent_features.select_dtypes(include=['uint8']).columns.tolist()

for col in unit8_cols:
    widened = respondent_features[col].astype('int64')
    respondent_features[col] = widened

In [0]:
# Largest number of wonky studies recorded for a single respondent.
respondent_features['wonky_study_count'].max()

In [0]:
# Correlation analysis: every usable respondent feature vs wonky_study_count.
#
# For each column that is not the ID, the target, or a datetime column, the cell
# computes Series.corr (pandas' default method) against the target on the rows
# where both values are present, then ranks features by absolute correlation.
# Skipped: constant columns, object columns that cannot be read as numbers,
# features with fewer than 10 valid pairs, and features whose correlation is
# NaN or raises.
#
# NOTE(review): columns derived from the target (e.g. wonky_task_ratio, which is
# computed from wonky_study_count earlier in this notebook) are not excluded and
# will correlate with it by construction — confirm this is intended.

target_col = 'wonky_study_count'
MIN_VALID_PAIRS = 10        # minimum non-null (feature, target) pairs required
TOP_N_CORRELATIONS = 100    # rows shown in the ranking table

# Ensure the target exists before doing any work
if target_col not in respondent_features.columns:
    raise ValueError("'wonky_study_count' column not found in respondent_features")

# Work on a copy; a missing target is treated as 0 for the correlation
df_corr = respondent_features.copy()
df_corr[target_col] = df_corr[target_col].fillna(0)

# Columns excluded from the analysis: ID, the target itself, and datetime columns
exclude_cols = [
    'respondentPk',  # ID column
    target_col,  # Target variable itself
]
date_cols = df_corr.select_dtypes(include=['datetime64']).columns.tolist()
exclude_cols.extend(date_cols)

feature_cols = [col for col in df_corr.columns if col not in exclude_cols]

print(f"Total features to analyze: {len(feature_cols)}")
print(f"Excluded columns: {exclude_cols}")
print(f"Date columns excluded: {date_cols}")
print()

# Calculate correlations
correlation_results = []

for feature in feature_cols:
    # Skip if feature has no variance (constant column)
    if df_corr[feature].nunique() <= 1:
        continue

    # Object columns are coerced to numeric; values that cannot be parsed become NaN
    feature_series = df_corr[feature].copy()
    if feature_series.dtype == 'object':
        try:
            feature_series = pd.to_numeric(feature_series, errors='coerce')
        except (TypeError, ValueError):
            # Values pandas cannot even attempt to coerce: skip the feature
            continue
        # Nothing numeric survived the coercion: non-numeric categorical, skip
        if feature_series.isna().all():
            continue

    # Get valid pairs (non-null for both feature and target)
    valid_mask = feature_series.notna() & df_corr[target_col].notna()
    n_valid = valid_mask.sum()

    if n_valid < MIN_VALID_PAIRS:
        continue

    # Best-effort: a feature whose dtype cannot be correlated is skipped rather
    # than aborting the whole scan.
    try:
        corr_value = feature_series.loc[valid_mask].corr(df_corr.loc[valid_mask, target_col])
    except Exception:
        continue

    # Correlation can be NaN, e.g. when the feature is constant after filtering
    if pd.notna(corr_value):
        correlation_results.append({
            'feature': feature,
            'correlation': corr_value,
            'abs_correlation': abs(corr_value),
            'sample_size': n_valid,
            'feature_type': 'numeric' if df_corr[feature].dtype in ['int64', 'float64'] else 'categorical'
        })

# Convert to DataFrame and sort by absolute correlation
corr_df = pd.DataFrame(correlation_results)

if len(corr_df) > 0:
    corr_df = corr_df.sort_values('abs_correlation', ascending=False).reset_index(drop=True)

    print("=" * 80)
    print("CORRELATION ANALYSIS: Features vs wonky_study_count")
    print("=" * 80)
    print(f"\nTotal features analyzed: {len(corr_df)}")
    print(f"Features with positive correlation: {(corr_df['correlation'] > 0).sum()}")
    print(f"Features with negative correlation: {(corr_df['correlation'] < 0).sum()}")
    print()

    # Display top correlations; the heading uses the same constant as the table
    # so the two cannot disagree.
    print(f"TOP {TOP_N_CORRELATIONS} FEATURES BY ABSOLUTE CORRELATION:")
    print("-" * 80)
    display(corr_df.head(TOP_N_CORRELATIONS)[['feature', 'correlation', 'abs_correlation', 'sample_size', 'feature_type']])

    # Summary statistics
    print("\nCORRELATION SUMMARY STATISTICS:")
    print("-" * 80)
    print(f"Mean absolute correlation: {corr_df['abs_correlation'].mean():.4f}")
    print(f"Median absolute correlation: {corr_df['abs_correlation'].median():.4f}")
    print(f"Max correlation: {corr_df['correlation'].max():.4f}")
    print(f"Min correlation: {corr_df['correlation'].min():.4f}")
    print(f"Features with |correlation| > 0.1: {(corr_df['abs_correlation'] > 0.1).sum()}")
    print(f"Features with |correlation| > 0.2: {(corr_df['abs_correlation'] > 0.2).sum()}")
    print(f"Features with |correlation| > 0.3: {(corr_df['abs_correlation'] > 0.3).sum()}")

else:
    print("No valid correlations could be calculated.")

Correlations are weak across the board; some of the stronger ones are device-related, but nothing large enough to go by.

Next stage: run the analysis on 3 months of data.

In [0]:
# Write the respondent-level features to misc_dir, reusing only the file name
# (not the directory) configured under output_files.respondent_features.
respondent_features_filename = os.path.basename(
    paths_config['output_files'].get('respondent_features')
)
respondent_features_path = os.path.join(misc_dir, respondent_features_filename)

respondent_features.to_parquet(respondent_features_path, index=False)