Exec Summary

-
-
-
-
-

In [0]:
%pip install pyyaml>=6.0 -q

In [0]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

try:
    import yaml
except ImportError as exc:
    # The requirement is quoted in the hint: left unquoted, a shell reads
    # `>=6.0` as an output redirection instead of a version constraint.
    # `from exc` keeps the original import failure in the traceback.
    raise ImportError(
        "PyYAML is not installed. Please run the previous cell to install it, "
        'or run: %pip install "pyyaml>=6.0"'
    ) from exc

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_time_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score
)
from eda.statistical_tests import (
    compare_groups_statistically,
    compare_groups_with_both_tests,
    analyze_thresholds,
    perform_chi_square_tests,
    compare_demographic_groups
)
from eda.visualizations import (
    create_histogram,
    create_box_plot,
    create_scatter_plot,
    create_bar_plot,
    create_temporal_breakdown_summary,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_dual_axis_statistical_chart,
    create_feature_breakdown_table,
    create_distribution_comparison,
    calculate_temporal_feature_deltas,       
    create_chi_squared_delta_dual_axis_chart,
)

# Load configs
def _read_yaml(path):
    """Parse a single YAML file with the safe loader and return its contents."""
    with open(path, 'r') as handle:
        return yaml.safe_load(handle)


feature_config = _read_yaml('../configs/feature_engineering.yaml')
stats_config = _read_yaml('../configs/statistical_tests.yaml')
paths_config = _read_yaml('../configs/data_paths.yaml')

# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### File Definitions

- **user_info_df**: DataFrame of respondent x task level data for all users (not just wonky studies)
- **wonky_studies_df**: DataFrame of respondents involved in studies with unexpected outcomes (negative impacts when positive expected)

A study is "wonky" if the outcome is unexpected (e.g., advertisement showed negative impacts of media, which is counter-intuitive).


### Load

In [0]:
# Resolve <parent of working directory>/misc, where the parquet inputs are read from
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")

# Only the base name of each configured output path is used; the directory is always misc_dir
configured_outputs = paths_config['output_files']
output_path = os.path.join(misc_dir, os.path.basename(configured_outputs['user_info_df']))
wonky_counts_path = os.path.join(misc_dir, os.path.basename(configured_outputs['wonky_user_counts']))
wonky_respondent_df_path = os.path.join(misc_dir, os.path.basename(configured_outputs['wonky_respondent_df']))
wonky_respondent_summary_path = os.path.join(
    misc_dir, os.path.basename(configured_outputs['wonky_respondent_summary'])
)

# total user info
user_info_df = pd.read_parquet(output_path)
# normal tasks and wonky tasks for wonky task respondents
wonky_counts = pd.read_parquet(wonky_counts_path)
# task level info for wonky task respondents
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
# summary of wonky task respondents
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)

In [0]:
# Preview the two task-level frames with the notebook's rich display
display(user_info_df.head())
display(wonky_respondent_df.head())

# Null count for every column of user_info_df
df = pd.DataFrame(user_info_df.isnull().sum(), columns=['null_count'])
display(df.reset_index())

# Columns of wonky_respondent_df that contain at least one null.
# The label names the frame that is actually inspected here.
print("\nwonky_respondent_df - Missing values:")
missing_wonky = wonky_respondent_df.isnull().sum()
print(missing_wonky[missing_wonky > 0])

In [0]:
# Display the wonky_counts frame read from parquet in the load cell
wonky_counts

In [0]:
# Descriptive statistics for whichever of the key numeric columns are present
key_numeric_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality', 'task_completed']
available_cols = [col for col in key_numeric_cols if col in user_info_df.columns]
print(user_info_df[available_cols].describe())

# Task-level comparison, only possible when the study flag column exists
if 'wonky_study_flag' in user_info_df.columns:
    print("\n" + "=" * 80)
    print("COMPARISON BY wonky_study_flag (Task Level)")
    print("=" * 80)
    comparison_cols = [
        col for col in ['task_time_taken_s', 'task_points', 'risk', 'quality']
        if col in user_info_df.columns
    ]

    if comparison_cols:
        wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 1]
        non_wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 0]

        print("\nWonky Study Tasks (wonky_study_flag=1):")
        print(wonky_study_tasks[comparison_cols].describe())

        print("\nNon-Wonky Study Tasks (wonky_study_flag=0):")
        print(non_wonky_study_tasks[comparison_cols].describe())

        # All tasks belonging to users who have at least one wonky study
        if 'wonky_studies_count' in user_info_df.columns:
            wonky_user_tasks = user_info_df[user_info_df['wonky_studies_count'] > 0]
            print("\nTasks from Users with Wonky Studies (wonky_studies_count > 0):")
            print(wonky_user_tasks[comparison_cols].describe())

# The banner names the frame that is actually summarised (wonky_counts)
print("\n" + "=" * 80)
print("STATISTICAL SUMMARY: wonky_counts")
print("=" * 80)
print(wonky_counts.describe())


### Feature Engineering

In [0]:
# Add time-based flag columns derived from the completion timestamp column
user_info_df = create_time_features(user_info_df, date_col="date_completed")

# Share of tasks flagged as night / weekend, as percentages
night_share = user_info_df['is_night'].mean() * 100
weekend_share = user_info_df['is_weekend'].mean() * 100
print(f"Night tasks: {night_share:.1f}%")
print(f"Weekend tasks: {weekend_share:.1f}%")

**TODO:** Test using chi-squared and z-tests.

### Temporal Feature Analysis & Breakdowns

Analyzing temporal patterns to identify differences between wonky and non-wonky study tasks.


In [0]:
# Distinct values of the grouping column used by the breakdowns and tests below
user_info_df['wonky_task_instances'].unique()

In [0]:
# Time-of-day / part-of-week flags followed by one flag per day of the week
time_window_flags = [
    "is_weekday",
    "is_weekend",
    "is_night",
    "is_business_hour",
    "is_business_hour_weekday",
    "is_business_hour_weekend",
]
day_of_week_flags = [
    "is_monday",
    "is_tuesday",
    "is_wednesday",
    "is_thursday",
    "is_friday",
    "is_saturday",
    "is_sunday",
]
temporal_features = time_window_flags + day_of_week_flags

# Breakdown of each temporal flag, grouped on wonky_task_instances with threshold 0
temporal_summary = create_temporal_breakdown_summary(
    user_info_df,
    temporal_features=temporal_features,
    group_col="wonky_task_instances",
    group_threshold=0,
)
print(temporal_summary)

**Task completion time is definitely a good gauge. The majority of tasks take place during business hours and are spread relatively evenly across the work week. The largest delta where wonky is more prevalent is in business hours, suggesting professional behaviours.**

#### Chi-Squared Tests for Temporal Features

Testing independence between temporal features and wonky study participation.
Chi-squared test determines if temporal patterns differ significantly between wonky and non-wonky groups.


In [0]:
# Chi-squared test of every temporal flag against wonky_task_instances at the 1% level
temporal_test_kwargs = dict(
    feature_set=temporal_features,
    group_var='wonky_task_instances',
    significance_level=0.01,
)
chi_square_results = perform_chi_square_tests(user_info_df, **temporal_test_kwargs)

chi_square_results

**Strong significance for all subsets except Wednesday and Friday, although this doesn't take directionality into consideration -> simple separators are business hours**

Strong significant read at the 99% level. Largest magnitude found at night; lowest magnitude during the weekend.

In [0]:
# Summary of which temporal features were flagged significant by the chi-squared tests
if len(chi_square_results) > 0:
    is_significant = chi_square_results['significant'] == True
    significant_features = chi_square_results[is_significant]
    n_significant = len(significant_features)
    print(f"\nSignificant temporal features (p < 0.01): {n_significant}")
    if n_significant > 0:
        significant_names = significant_features.index.tolist()
        print("Features:", ', '.join(significant_names))

In [0]:
# Bar chart of the chi-squared statistic per temporal feature (skipped when there are no results)
if len(chi_square_results) == 0:
    print("No chi-squared test results available for visualization")
else:
    chi2_chart_kwargs = dict(
        chi2_col='chi2',
        p_value_col='chi_p_value',
        significance_level=0.01,
        title="Chi-Squared Statistic by Temporal Feature",
    )
    fig_chi2 = create_chi_squared_bar_chart(chi_square_results, **chi2_chart_kwargs)
    fig_chi2.show()


In [0]:
# Dual-axis chart: chi-squared statistic together with the per-feature delta between groups.
# Needs both the chi-squared results and a non-empty delta table.
if len(chi_square_results) == 0:
    print("No chi-squared test results available for visualization")
else:
    delta_results = calculate_temporal_feature_deltas(
        user_info_df,
        temporal_features=temporal_features,
        group_col='wonky_task_instances',
        group_threshold=0,
    )

    if len(delta_results) == 0:
        print("No delta results available for visualization")
    else:
        fig_dual = create_chi_squared_delta_dual_axis_chart(
            chi_square_results,
            delta_results,
            chi2_col='chi2',
            p_value_col='chi_p_value',
            delta_col='delta_pct',
            significance_level=0.01,
            title="Chi-Squared Statistic and Delta % by Temporal Feature",
        )
        fig_dual.show()

**The bars show the level of significance between wonky and non-wonky; the line shows the deltas between wonky and non-wonky in terms of when tasks are completed.**

**A positive delta means wonky participants are more prevalent and a negative delta means they are less prevalent.**

**Business hours, night time and Saturdays look like the overall best separators between wonky and non-wonky participants in terms of task completion time.**

### Task speed features

In [0]:
# Cap task time at its 99.99th percentile so extreme outliers do not dominate mean/std.
# Series.clip leaves missing values missing; an np.where(x < cap, x, cap) formulation
# evaluates NaN < cap as False and would silently replace NaN with the cap itself.
task_time_cap = user_info_df['task_time_taken_s'].quantile(0.9999)
user_info_df['task_time_taken_s_capped'] = user_info_df['task_time_taken_s'].clip(upper=task_time_cap)

In [0]:
# Build the speed-category flags from the capped task time (std-dev mode requested)
user_info_df = create_task_speed_features(
    user_info_df, task_time_col="task_time_taken_s_capped", use_std_dev=True
)

# Mean and standard deviation of the capped column, plus the ±1σ / ±2σ cut-offs derived from them
capped_times = user_info_df["task_time_taken_s_capped"]
mean_time = capped_times.mean()
std_time = capped_times.std()
print("Task time statistics:")
print(f"  Mean: {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev: {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
print(f"  Fast threshold (mean - 1σ): {mean_time - std_time:.2f}s")
print(f"  Suspiciously fast threshold (mean - 2σ): {mean_time - 2*std_time:.2f}s")
print(f"  Slow threshold (mean + 1σ): {mean_time + std_time:.2f}s")
print(f"  Suspiciously slow threshold (mean + 2σ): {mean_time + 2*std_time:.2f}s")
print()

# Display breakdown with wonky vs non-wonky comparison
capped_speed_summary = create_task_speed_breakdown_summary(
    user_info_df,
    group_col='wonky_task_instances',
    group_threshold=0,
)
print(capped_speed_summary)

**Wonky participants usually range from suspiciously fast to normal; non-wonky participants tend to range from normal to suspiciously slow, in terms of delta.**

In [0]:
# Rebuild the speed-category flags, this time from the uncapped task time.
# NOTE(review): the flags are requested with use_std_dev=True while the thresholds
# printed below are percentiles of the raw column — confirm both describe the same cut-offs.
user_info_df = create_task_speed_features(
    user_info_df, task_time_col="task_time_taken_s", use_std_dev=True
)

raw_times = user_info_df["task_time_taken_s"]
fast_threshold = raw_times.quantile(0.16)
suspiciously_fast_threshold = raw_times.quantile(0.025)
slow_threshold = raw_times.quantile(0.84)
suspiciously_slow_threshold = raw_times.quantile(0.975)

# Reference mean/std after clipping values to the 1st-99th percentile range
lower_bound = raw_times.quantile(0.01)
upper_bound = raw_times.quantile(0.99)
trimmed_data = raw_times.clip(lower=lower_bound, upper=upper_bound)
mean_time = trimmed_data.mean()
std_time = trimmed_data.std()

print("Task time statistics (using percentiles, robust to outliers):")
print(f"  Mean (trimmed 1%-99%): {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev (trimmed 1%-99%): {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
print(f"  Fast threshold (16th percentile): {fast_threshold:.2f}s ({fast_threshold/60:.2f} min)")
print(f"  Suspiciously fast threshold (2.5th percentile): {suspiciously_fast_threshold:.2f}s ({suspiciously_fast_threshold/60:.2f} min)")
print(f"  Slow threshold (84th percentile): {slow_threshold:.2f}s ({slow_threshold/60:.2f} min)")
print(f"  Suspiciously slow threshold (97.5th percentile): {suspiciously_slow_threshold:.2f}s ({suspiciously_slow_threshold/60:.2f} min)")
print()

# Display breakdown with wonky vs non-wonky comparison.
# NOTE(review): the fallback name 'wonky_study_count' differs from the
# 'wonky_studies_count' column checked earlier in this notebook — confirm which is correct.
if 'wonky_task_instances' in user_info_df.columns:
    group_col_to_use = 'wonky_task_instances'
else:
    group_col_to_use = 'wonky_study_count'
raw_speed_summary = create_task_speed_breakdown_summary(
    user_info_df,
    group_col=group_col_to_use,
    group_threshold=0,
)
print(raw_speed_summary)

In [0]:
from eda.statistical_tests import compare_speed_categories_proportions

# Statistical tests for speed categories: Chi-squared and Two-Proportion Z-Test
speed_features = ['is_suspiciously_fast', 'is_fast', 'is_normal_speed', 'is_slow', 'is_suspiciously_slow']

# One significance level shared by both tests AND by every message printed below,
# so the text can never disagree with the threshold that was actually applied.
speed_alpha = 0.01

# Ensure we have the correct group column
group_col_to_use = 'wonky_task_instances' if 'wonky_task_instances' in user_info_df.columns else 'wonky_study_count'


def _pretty_speed_label(feature):
    """Turn a flag column name such as 'is_suspiciously_fast' into 'Suspiciously Fast'."""
    return feature.replace('is_', '').replace('_', ' ').title()


print("=" * 80)
print("STATISTICAL TESTS: Speed Categories vs Wonky Status")
print("=" * 80)
print()

# 1. Chi-squared test (overall association)
print("1. CHI-SQUARED TEST (Overall Association)")
print("-" * 80)
print("Tests if speed category distribution is independent of wonky task status")
print()

chi2_speed_results = perform_chi_square_tests(
    user_info_df,
    feature_set=speed_features,
    group_var=group_col_to_use,
    significance_level=speed_alpha
)

if len(chi2_speed_results) > 0:
    print("Results:")
    display(chi2_speed_results)
    print()

    significant_features = chi2_speed_results[chi2_speed_results['significant']]
    if len(significant_features) > 0:
        print(f"✓ Significant association found for {len(significant_features)} speed category(ies)")
        print(f"  Features: {', '.join(significant_features.index.tolist())}")
    else:
        print(f"✗ No significant association found (p > {speed_alpha})")
else:
    print("No results available (check if speed features exist in DataFrame)")
print()

# 2. Two-proportion Z-test (individual category comparisons)
print("2. TWO-PROPORTION Z-TEST (Individual Category Comparisons)")
print("-" * 80)
print("Tests if proportion differs significantly for each speed category")
print()

ztest_speed_results = compare_speed_categories_proportions(
    user_info_df,
    speed_features=speed_features,
    group_col=group_col_to_use,
    group_threshold=0,
    significance_level=speed_alpha
)

if len(ztest_speed_results) > 0:
    print("Results:")
    display(ztest_speed_results.reset_index(drop=True))
    print()

    # Format results for easier interpretation
    print("Summary:")
    for _, row in ztest_speed_results.iterrows():
        feature_name = _pretty_speed_label(row['feature'])
        sig_marker = "***" if row['significant'] else ""
        print(f"  {feature_name}:")
        print(f"    Wonky: {row['wonky_proportion']*100:.2f}% ({row['wonky_count']:,}/{row['wonky_total']:,})")
        print(f"    Non-wonky: {row['non_wonky_proportion']*100:.2f}% ({row['non_wonky_count']:,}/{row['non_wonky_total']:,})")
        print(f"    Difference: {row['proportion_diff']*100:+.2f}%")
        print(f"    Z-statistic: {row['z_statistic']:.3f}, p-value: {row['p_value']:.4f} {sig_marker}")
        print()

    significant_categories = ztest_speed_results[ztest_speed_results['significant']]
    if len(significant_categories) > 0:
        print(f"✓ Significant differences found for {len(significant_categories)} category(ies):")
        for _, row in significant_categories.iterrows():
            feature_name = _pretty_speed_label(row['feature'])
            direction = "higher" if row['proportion_diff'] > 0 else "lower"
            print(f"  - {feature_name}: Wonky tasks have {abs(row['proportion_diff']*100):.2f}% {direction} proportion")
    else:
        print(f"✗ No significant differences found (p > {speed_alpha} for all categories)")
else:
    print("No results available (check if speed features exist in DataFrame)")
print()

print("=" * 80)
print("Interpretation:")
print("- Chi-squared test: Overall test of independence (all categories together)")
print("- Two-proportion Z-test: Individual tests for each category")
print(f"- Significant results (p < {speed_alpha}) indicate wonky and non-wonky groups differ")
print("=" * 80)

**TODO:** Rethink the visualization for this section — all categories are significant across both the z-test and the chi-squared test.

### Task and Point Category

In [0]:
# Combine task category and payout points into one label: "<taskCategory>_points_<payoutPoints>"
task_category_text = user_info_df['taskCategory'].astype(str)
payout_points_text = user_info_df['payoutPoints'].astype(str)
user_info_df['defined_task_category'] = task_category_text + "_points_" + payout_points_text

In [0]:
# Number of rows per combined task-category / payout-points label
user_info_df['defined_task_category'].value_counts()

In [0]:
# A bare `break` outside a loop is a SyntaxError, so its only effect was to abort
# "Run All" at this point. Presumably that stop is intentional (TODO confirm);
# raise an explicit, self-describing error so the halt is clearly deliberate.
raise RuntimeError(
    "Notebook execution stopped here on purpose: run the cells below manually."
)

In [0]:
# Create task speed features from the fixed thresholds in the feature-engineering config
time_thresholds = feature_config["time_thresholds"]
suspicious_fast_s = time_thresholds["suspicious_fast_seconds"]
very_fast_s = time_thresholds["very_fast_seconds"]
very_slow_min = time_thresholds["very_slow_minutes"]

user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    suspicious_threshold=suspicious_fast_s,
    very_fast_threshold=very_fast_s,
    # The config stores this threshold in minutes; the function is given seconds
    very_slow_threshold=very_slow_min * 60,
)

print(
    f"Suspiciously fast (<{suspicious_fast_s}s): {user_info_df['is_suspiciously_fast'].sum():,} ({user_info_df['is_suspiciously_fast'].mean()*100:.2f}%)"
)

print(
    f"Very fast (<{very_fast_s}s): {user_info_df['is_very_fast'].sum():,} ({user_info_df['is_very_fast'].mean()*100:.2f}%)"
)

# Label fixed: this is the slow flag and its threshold is expressed in minutes.
# NOTE(review): '>' assumes "very slow" means longer than the threshold — confirm against the helper.
print(
    f"Very slow (>{very_slow_min} min): {user_info_df['is_very_slow'].sum():,} ({user_info_df['is_very_slow'].mean()*100:.2f}%)"
)

### Respondent level features

In [0]:
# Create respondent-level behavioral features

import importlib

# If the feature-engineering module is already loaded, reload it and re-bind the
# function so that the reloaded definition is the one called below.
fe_module_name = 'eda.feature_engineering'
if fe_module_name in sys.modules:
    importlib.reload(sys.modules[fe_module_name])
    from eda.feature_engineering import create_respondent_behavioral_features

# Settings forwarded from the feature-engineering config
volume_thresholds = feature_config['volume_thresholds']
behavior_config = {
    'high_volume_percentile': volume_thresholds['high_volume_percentile'],
    'extreme_volume_percentile': volume_thresholds['extreme_volume_percentile'],
    'velocity_bins': feature_config['velocity_bins'],
    'velocity_labels': feature_config['velocity_labels'],
}

respondent_features = create_respondent_behavioral_features(
    user_info_df,
    respondent_id_col="respondentPk",
    date_col="date_completed",
    config=behavior_config,
    demographic_cols=['platform_name', 'hardware_version', 'survey_locale'],
)

n_respondents = respondent_features.shape[0]
mean_tasks = respondent_features['total_tasks'].mean()
mean_suspicious_pct = respondent_features['suspicious_fast_rate'].mean() * 100
print(f"Aggregated to {n_respondents:,} respondents")
print(f"Avg tasks per respondent: {mean_tasks:.2f}")
print(f"Avg suspicious fast rate: {mean_suspicious_pct:.2f}%")

In [0]:
# Display the respondent-level feature table built in the previous cell
respondent_features

In [0]:
# Add wonky features using modular function.
# NOTE(review): `wonky_studies_df` is not assigned in any cell visible above (the load
# cell creates wonky_counts, wonky_respondent_df and wonky_respondent_summary) —
# confirm which frame is intended here.
respondent_features = add_wonky_features(
    respondent_features, wonky_studies_df, respondent_id_col="respondentPk"
)

n_with_wonky = (respondent_features['wonky_task_ratio'] > 0).sum()
n_high_wonky = respondent_features['is_high_wonky'].sum()
print("Wonky features added")
print(f"Respondents with wonky tasks: {n_with_wonky:,}")
print(f"High wonky concentration (>50%): {n_high_wonky:,}")

### Distribution Comparisons

Comparing distributions of key features between wonky and non-wonky groups to visualize differences.


In [0]:
# Cap the wonky task ratio at 1: values above 1 become 1, everything else is kept as-is
wonky_ratio = respondent_features['wonky_task_ratio']
respondent_features['wonky_task_ratio_capped'] = wonky_ratio.mask(wonky_ratio > 1, 1)

In [0]:
# Distribution comparisons for key features
if 'respondent_features' not in globals():
    raise ValueError("'respondent_features' DataFrame not found. Please run previous cells to create it.")

# Ensure has_wonky_tasks column exists.
# The flag is derived from the same column the guard checks ('wonky_task_ratio'),
# so the fallback cannot fail on a missing capped column; "> 0" gives the same
# flag for the raw and the capped ratio because capping only affects values above 1.
if 'has_wonky_tasks' not in respondent_features.columns:
    if 'wonky_task_ratio' in respondent_features.columns:
        print("Warning: 'has_wonky_tasks' column not found. Creating it from 'wonky_task_ratio'...")
        respondent_features['has_wonky_tasks'] = (respondent_features['wonky_task_ratio'] > 0).astype(int)
    else:
        raise ValueError(
            "'has_wonky_tasks' column not found and 'wonky_task_ratio' is also missing. "
            "Cannot create distribution comparisons. Available columns: "
            f"{list(respondent_features.columns)[:30]}"
        )

key_features_for_dist = ['total_tasks', 'suspicious_fast_rate', 'days_active',
                         'avg_task_time', 'wonky_task_ratio_capped']

# Filter to only features that exist
available_features = [f for f in key_features_for_dist if f in respondent_features.columns]
missing_features = [f for f in key_features_for_dist if f not in respondent_features.columns]

if missing_features:
    print(f"Warning: Some features not found: {missing_features}")
    print(f"   Available columns: {list(respondent_features.columns)[:30]}")

if len(available_features) == 0:
    print("Error: None of the requested features are available in respondent_features.")
    print(f"   Requested: {key_features_for_dist}")
    print(f"   Available: {list(respondent_features.columns)[:30]}")
else:
    print(f"✓ Creating distribution comparisons for {len(available_features)} features: {available_features}")

for feature in available_features:
    # Arguments shared by the histogram and the box plot of this feature
    comparison_kwargs = dict(
        feature=feature,
        group_col='has_wonky_tasks',
        group1_value=1,
        group2_value=0,
        group1_name='Wonky Users',
        group2_name='Non-Wonky Users',
        title=f'{feature.replace("_", " ").title()} Distribution: Wonky vs Non-Wonky Users',
    )
    try:
        # Create histogram comparison
        fig_dist = create_distribution_comparison(
            respondent_features, plot_type='histogram', **comparison_kwargs
        )
        fig_dist.show()

        # Create box plot comparison
        fig_box = create_distribution_comparison(
            respondent_features, plot_type='box', **comparison_kwargs
        )
        fig_box.show()
    except Exception as e:
        # Best effort: report the failing feature and carry on with the next one
        print(f"Error creating distribution comparison for '{feature}': {e}")
        continue


In [0]:
# First rows of the respondent-level feature table
respondent_features.head()

In [0]:
# Feature summary tables

# NOTE(review): feature_col and group_col are both 'has_wonky_tasks' — confirm this is intended.
breakdown_kwargs = dict(
    feature_col='has_wonky_tasks',
    group_col='has_wonky_tasks',
    group1_value=1,
    group2_value=0,
    metrics=key_features_for_dist,
)
feature_breakdown = create_feature_breakdown_table(respondent_features, **breakdown_kwargs)

In [0]:
# Show the breakdown table transposed
feature_breakdown.T

### Demographic Group Comparisons

Comparing wonky task rates across demographic groups (platforms, hardware versions, locales) using statistical tests.
This helps identify if certain demographics are more associated with wonky study participation.


In [0]:
# Display the respondent-level feature table before the demographic comparisons
respondent_features

In [0]:
# Compare wonky task rates across demographic groups

demographic_cols = ['platform_name', 'hardware_version', 'survey_locale']

# Significance level used by the tests and quoted in the printed message
demographic_alpha = 0.05

# Comparison tables keyed by demographic column; only non-empty results are stored
demographic_results = {}

for demo_col in demographic_cols:
    if demo_col not in respondent_features.columns:
        continue

    print(f"\n{'=' * 80}")
    print(f"Demographic Comparison: {demo_col}")
    print('=' * 80)

    # Perform comparisons
    demo_comparisons = compare_demographic_groups(
        respondent_features,
        demographic_col=demo_col,
        target_col='wonky_task_ratio',
        min_group_size=10,
        significance_level=demographic_alpha
    )

    if len(demo_comparisons) == 0:
        print(f"Insufficient data for {demo_col} comparisons")
        continue

    demographic_results[demo_col] = demo_comparisons

    # Show summary statistics by group
    group_summary = respondent_features.groupby(demo_col).agg({
        'wonky_task_ratio': ['mean', 'median', 'count'],
        'has_wonky_tasks': 'sum'
    }).round(4)
    group_summary.columns = ['mean_wonky_ratio', 'median_wonky_ratio', 'total_count', 'wonky_count']
    group_summary['wonky_rate'] = (group_summary['wonky_count'] / group_summary['total_count'] * 100).round(2)
    group_summary = group_summary.sort_values('mean_wonky_ratio', ascending=False)

    print(f"\nSummary by {demo_col}:")
    display(group_summary)

    # Show significant comparisons
    significant_comps = demo_comparisons[demo_comparisons['mw_significant'] == True]
    if len(significant_comps) > 0:
        print(f"\nSignificant differences (p < {demographic_alpha}): {len(significant_comps)}")
        display(significant_comps[['group1', 'group2', 'mean_difference', 'mw_p_value', 'welch_p_value', 'tests_agree']])
    else:
        print("\nNo significant differences found between groups")


In [0]:
# Visualize demographic comparisons
# Why: Bar charts make it easy to see which demographic groups have higher wonky task rates
# Helps identify patterns that may inform modeling or business decisions

for demo_col in demographic_cols:
    # Only plot columns that exist and produced comparison results
    if demo_col not in respondent_features.columns or demo_col not in demographic_results:
        continue

    pretty_name = demo_col.replace("_", " ").title()

    # Create summary by group
    demo_summary = (
        respondent_features.groupby(demo_col)
        .agg(
            mean_wonky_ratio=('wonky_task_ratio', 'mean'),
            wonky_count=('has_wonky_tasks', 'sum'),
            total_count=('has_wonky_tasks', 'count'),
        )
        .reset_index()
    )
    demo_summary['wonky_rate'] = (demo_summary['wonky_count'] / demo_summary['total_count'] * 100)
    demo_summary = demo_summary.sort_values('mean_wonky_ratio', ascending=False)

    # Bar chart of wonky rates by demographic group
    fig_demo = create_bar_plot(
        demo_summary,
        x=demo_col,
        y='wonky_rate',
        title=f'Wonky Task Rate by {pretty_name}',
        labels={demo_col: pretty_name, 'wonky_rate': 'Wonky Task Rate (%)'},
        color='wonky_rate',
        color_continuous_scale='Reds',
        text='wonky_rate',
        texttemplate='%{text:.1f}%',
        textposition='outside',
        tickangle=45,
    )
    fig_demo.show()

    # Box plot comparing wonky task ratios across groups
    fig_demo_box = create_box_plot(
        respondent_features,
        x=demo_col,
        y='wonky_task_ratio',
        title=f'Wonky Task Ratio Distribution by {pretty_name}',
        labels={demo_col: pretty_name, 'wonky_task_ratio': 'Wonky Task Ratio'},
        tickangle=45,
    )
    fig_demo_box.show()


### Feature Selection Guidance

Based on EDA results, identify which features show strongest discrimination and should be prioritized for modeling.


#### Modeling Recommendations

Based on EDA findings, recommendations for feature selection and modeling approach.


In [0]:
# Generate modeling recommendations
# Why: Synthesizes EDA findings into actionable recommendations for model development
# Note: Requires statistical_results to be created first (run the statistical tests cell before this one)

recommendations = []

# Check statistical test results
if 'statistical_results' not in globals():
    recommendations.append("⚠️  Warning: 'statistical_results' not found. Run the 'Hypothesis Testing' section first.")
elif len(statistical_results) > 0:
    # When the Welch column is absent, treat every row as passing that test.
    # The fallback mask is built on the results' own index so that it aligns
    # row-for-row with the Mann-Whitney mask whatever index the table carries.
    if 'welch_significant' in statistical_results.columns:
        welch_flag = statistical_results['welch_significant']
    else:
        welch_flag = pd.Series(True, index=statistical_results.index)

    significant_features = statistical_results[
        (statistical_results['mw_significant'] == True) & (welch_flag == True)
    ]

    if len(significant_features) > 0:
        recommendations.append(f"✓ {len(significant_features)} features show significant differences (both tests agree)")
        recommendations.append(f"  → Prioritize these features: {', '.join(significant_features['metric'].head(5).tolist())}")

    # Check effect sizes: "large" means above the 75th percentile of |mean_difference|
    if 'mean_difference' in statistical_results.columns:
        abs_mean_diff = statistical_results['mean_difference'].abs()
        large_effect = statistical_results[abs_mean_diff > abs_mean_diff.quantile(0.75)]
        if len(large_effect) > 0:
            recommendations.append(f"✓ {len(large_effect)} features show large effect sizes")
            recommendations.append(f"  → These features have practical significance: {', '.join(large_effect['metric'].head(3).tolist())}")

# Check temporal features
if len(chi_square_results) > 0:
    sig_temporal = chi_square_results[chi_square_results['significant'] == True]
    if len(sig_temporal) > 0:
        recommendations.append(f"✓ {len(sig_temporal)} temporal features are significantly associated with wonky studies")
        recommendations.append(f"  → Include temporal features: {', '.join(sig_temporal.index.tolist())}")

# Check demographic differences
if demographic_results:
    total_sig_demo = sum(
        len(comparisons[comparisons['mw_significant'] == True])
        for comparisons in demographic_results.values()
    )
    if total_sig_demo > 0:
        recommendations.append(f"✓ Found {total_sig_demo} significant demographic group differences")
        recommendations.append("  → Consider including demographic features as interaction terms")

print("=" * 80)
print("MODELING RECOMMENDATIONS")
print("=" * 80)
print()
for rec in recommendations:
    print(rec)
print()
print("=" * 80)
print("NEXT STEPS:")
print("=" * 80)
print("1. Use top-ranked features from feature ranking for initial model")
print("2. Include temporal features that show significant associations")
print("3. Consider demographic interactions if significant differences found")
print("4. Validate feature importance using model-based methods (e.g., Random Forest feature importance)")
print("5. Monitor model performance on features identified as significant in EDA")


In [0]:
# Create fraud risk score using modular function; only the fraud-related config keys are forwarded
fraud_config_keys = (
    'fraud_score_weights',
    'fraud_score_thresholds',
    'fraud_score_bins',
    'fraud_score_labels',
    'suspected_fraud_threshold',
)
fraud_score_config = {key: feature_config[key] for key in fraud_config_keys}
respondent_features = create_fraud_risk_score(respondent_features, config=fraud_score_config)

# Create wonky risk score, again forwarding only its own config keys
wonky_config_keys = (
    'wonky_score_weights',
    'wonky_score_thresholds',
    'wonky_score_bins',
    'wonky_score_labels',
)
wonky_score_config = {key: feature_config[key] for key in wonky_config_keys}
respondent_features = create_wonky_risk_score(respondent_features, config=wonky_score_config)

# Tier counts in tier order, then the overall suspected-fraud share
fraud_tier_counts = respondent_features['fraud_risk_tier'].value_counts().sort_index()
suspected_fraud_pct = respondent_features['suspected_fraud'].mean() * 100
print("Fraud risk distribution:")
print(fraud_tier_counts)
print(f"Suspected fraud rate: {suspected_fraud_pct:.2f}%")


### Hypothesis Testing

Running Mann-Whitney U test and Welch's t-test for validation.

Check if Mann-Whitney U and Welch's t-test agree on significance.
When both tests agree, we have higher confidence in the results.
When they disagree, it signals need for further investigation.

Compare wonky vs non-wonky users using statistical tests.
- Mann-Whitney U test (non-parametric, primary test)
- Welch's t-test (parametric validator/sense check)
Uses `compare_groups_with_both_tests()` to run both tests simultaneously.

In [0]:
# Create binary flag for wonky users: 1 when the respondent's wonky task ratio is above zero
has_wonky_ratio = respondent_features['wonky_task_ratio'] > 0
respondent_features['has_wonky_tasks'] = has_wonky_ratio.astype(int)

wonky_flag = respondent_features['has_wonky_tasks']
non_wonky_mask = wonky_flag == 0
print(f"Wonky users: {wonky_flag.sum():,} ({wonky_flag.mean()*100:.2f}%)")
print(f"Non-wonky users: {non_wonky_mask.sum():,} ({non_wonky_mask.mean()*100:.2f}%)")

### Statistical Test Results Visualization

Visualizing statistical test results to identify features with strongest discrimination between wonky and non-wonky groups.

In [0]:
# Perform statistical tests using modular function; every setting comes from the stats config
group_settings = stats_config['group_comparison']
statistical_results = compare_groups_with_both_tests(
    respondent_features,
    group_col=group_settings['group_col'],
    metrics=stats_config['test_metrics'],
    group1_value=group_settings['group1_value'],
    group2_value=group_settings['group2_value'],
    significance_level=stats_config['significance_level'],
)

print("Statistical Test Results (Mann-Whitney U + Welch's t-test):")
print("=" * 100)
display(statistical_results)


In [0]:
# Create dual-axis chart showing count differences and Welch's t-statistic
# Why: Puts the relative difference in group means (wonky vs non-wonky, relative to
# the non-wonky mean) next to the Welch t-statistic for the same feature.
# Note: Requires statistical_results to be created first (run the statistical tests cell before this one)

# Every column this cell reads from statistical_results. Checked up front so a
# schema mismatch produces a readable message instead of a KeyError mid-cell.
dual_axis_required_cols = ['metric', 'wonky_mean', 'non_wonky_mean', 'welch_statistic']

if 'statistical_results' not in globals():
    print("Warning: 'statistical_results' not found.")
    print("   Please run the 'Hypothesis Testing' section cells first to create statistical_results.")
    print("   This visualization will be skipped.")
else:
    dual_axis_missing_cols = [
        col for col in dual_axis_required_cols if col not in statistical_results.columns
    ]

    if len(statistical_results) > 0 and not dual_axis_missing_cols:
        # Calc normalized count differences
        testdf_2 = statistical_results.copy()
        testdf_2['count_difference'] = testdf_2['wonky_mean'] - testdf_2['non_wonky_mean']
        # A zero baseline is mapped to NaN so the ratio is missing rather than +/-inf
        testdf_2['count_difference_nrm'] = (
            testdf_2['count_difference'] / testdf_2['non_wonky_mean'].replace(0, np.nan)
        )

        testdf_2 = testdf_2.set_index('metric')

        fig_dual = create_dual_axis_statistical_chart(
            testdf_2,
            count_diff_col='count_difference_nrm',
            t_stat_col='welch_statistic',
            title="Ave Wonky Count Difference & Welch's t-statistic by Feature"
        )
        fig_dual.show()
    else:
        # Say exactly why the chart was skipped: no rows, missing columns, or both
        print("Statistical results not available for dual-axis chart")
        if len(statistical_results) == 0:
            print("   statistical_results exists but is empty")
        if dual_axis_missing_cols:
            print(f"   statistical_results is missing required column(s): {dual_axis_missing_cols}")
        print(f"   Available columns: {list(statistical_results.columns)}")


In [0]:
# Feature importance summary
# Why: Ranks features by statistical significance, effect size, and test agreement
# Helps prioritize which features to include in models
# Note: Requires statistical_results to be created first (run the statistical tests cell before this one)


def build_feature_ranking(results, significance_weight=0.5, effect_weight=0.5):
    """Rank metrics by a composite of significance and effect size.

    Parameters
    ----------
    results : pandas.DataFrame
        One row per metric. Must contain 'mw_p_value' and either
        'welch_statistic' or 'mean_difference'. 'tests_agree' is optional.
    significance_weight : float, default 0.5
        Weight applied to ``1 - mw_p_value`` (p-value clipped to [0, 1]).
    effect_weight : float, default 0.5
        Weight applied to the effect size scaled by its maximum.

    Returns
    -------
    pandas.DataFrame
        Copy of ``results`` with 'effect_size', 'ranking_score' and
        'both_tests_agree' added, sorted by 'ranking_score' descending.
        Rows with a missing p-value or effect size get a NaN score, which
        sort_values places last.
    """
    ranking = results.copy()

    # Effect-size proxy: |Welch t| when available, otherwise |mean difference|.
    # NOTE(review): a t-statistic grows with sample size, so this is a proxy
    # rather than a standardized effect size.
    if 'welch_statistic' in ranking.columns:
        ranking['effect_size'] = ranking['welch_statistic'].abs()
    else:
        ranking['effect_size'] = ranking['mean_difference'].abs()

    # Scale by the largest effect. If that maximum is 0 (or NaN), dividing by it
    # would turn every score into NaN, so fall back to a scale of 1 and let the
    # effect component contribute 0.
    max_effect = ranking['effect_size'].max()
    effect_scale = max_effect if max_effect > 0 else 1.0

    # Create ranking score (lower p-value + higher effect size = better)
    ranking['ranking_score'] = (
        (1 - ranking['mw_p_value'].clip(0, 1)) * significance_weight
        + (ranking['effect_size'] / effect_scale) * effect_weight
    )

    # Add test agreement indicator.
    # NOTE(review): defaults to True when 'tests_agree' is absent, i.e. agreement
    # is assumed rather than measured in that case.
    if 'tests_agree' in ranking.columns:
        ranking['both_tests_agree'] = ranking['tests_agree']
    else:
        ranking['both_tests_agree'] = True

    return ranking.sort_values('ranking_score', ascending=False)


if 'statistical_results' not in globals():
    print("⚠️  Warning: 'statistical_results' not found.")
    print("   Please run the 'Hypothesis Testing' section cells first to create statistical_results.")
elif len(statistical_results) > 0:
    feature_ranking = build_feature_ranking(statistical_results)

    # Key columns for display, in display order; only those present are shown
    display_cols = [
        'metric', 'mean_difference', 'mw_p_value', 'welch_p_value',
        'welch_statistic', 'both_tests_agree', 'ranking_score',
    ]
    available_cols = [col for col in display_cols if col in feature_ranking.columns]

    print("Feature Ranking for Modeling (Top Features)")
    print("=" * 100)
    print("Ranked by: Statistical significance + Effect size")
    print("Higher ranking_score = better feature for modeling")
    print()
    display(feature_ranking[available_cols].head(20))

    # Identify top features
    top_features = feature_ranking.head(10)['metric'].tolist()
    print("\nTop 10 Features Recommended for Modeling:")
    for i, feat in enumerate(top_features, 1):
        print(f"  {i}. {feat}")
else:
    print("Statistical results not available for feature ranking")


In [0]:
# Analyze test agreement
# Why: Counts the metrics on which Mann-Whitney U and Welch's t-test reach the same
# significance verdict and lists the ones where they differ.
# Note: Requires statistical_results to be created first (run the statistical tests cell before this one)

if 'statistical_results' not in globals():
    print("⚠️  Warning: 'statistical_results' not found.")
    print("   Please run the 'Hypothesis Testing' section cells first to create statistical_results.")
elif 'tests_agree' in statistical_results.columns:
    agreement_stats = statistical_results['tests_agree'].value_counts()
    print("Test Agreement Analysis:")
    print(f"Metrics where tests agree: {agreement_stats.get(True, 0)}")
    print(f"Metrics where tests disagree: {agreement_stats.get(False, 0)}")

    # Show metrics where tests disagree.
    # .eq(False) keeps only explicit False values; missing values are not counted.
    disagree = statistical_results[statistical_results['tests_agree'].eq(False)]
    if len(disagree) > 0:
        print("\nMetrics where tests disagree (need investigation):")
        disagree_cols = ['metric', 'mw_p_value', 'welch_p_value', 'mw_significant', 'welch_significant']
        # Show only the columns that exist so a missing one does not raise KeyError
        display(disagree[[col for col in disagree_cols if col in disagree.columns]])
else:
    # Without this branch the cell printed nothing when the column was absent
    print("Test agreement not available: 'tests_agree' column missing from statistical_results")


In [0]:
# Threshold analysis for key features: both runs use the default bins from
# stats_config and the has_wonky_tasks flag as the target column.
threshold_kwargs = {
    'bins': stats_config['threshold_analysis']['default_bins'],
    'target_col': 'has_wonky_tasks',
}
section_rule = "=" * 80

# Task volume
print("Threshold Analysis: Task Volume")
print(section_rule)
volume_thresholds = analyze_thresholds(
    respondent_features, feature='total_tasks', **threshold_kwargs
)
display(volume_thresholds)

# Suspicious fast rate
print("\nThreshold Analysis: Suspicious Fast Rate")
print(section_rule)
speed_thresholds = analyze_thresholds(
    respondent_features, feature='suspicious_fast_rate', **threshold_kwargs
)
display(speed_thresholds)

In [0]:
# Example visualizations using modular functions

# 1) Fraud risk score histogram, colored by risk tier
fraud_hist_labels = {'fraud_risk_score': 'Fraud Risk Score (0-10)'}
fig_fraud_risk = create_histogram(
    respondent_features,
    x='fraud_risk_score',
    color='fraud_risk_tier',
    nbins=11,
    title='Fraud Risk Score Distribution',
    labels=fraud_hist_labels,
)
fig_fraud_risk.show()

# 2) Average task time box plot, wonky vs non-wonky users (non-wonky group first)
task_time_labels = {
    'has_wonky_tasks': 'Has Wonky Tasks (1=Yes, 0=No)',
    'avg_task_time': 'Average Task Time (seconds)',
}
wonky_group_order = {'has_wonky_tasks': [0, 1]}
fig_speed = create_box_plot(
    respondent_features,
    x='has_wonky_tasks',
    y='avg_task_time',
    color='has_wonky_tasks',
    title='Average Task Time: Wonky vs Non-Wonky Users',
    labels=task_time_labels,
    category_orders=wonky_group_order,
    boxmean='sd',
)
fig_speed.show()

## Summary

**Key Findings:**
- Statistical tests (Mann-Whitney U + Welch's t-test) identify significant differences between wonky and non-wonky users
- Temporal features show strong associations with wonky study participation (chi-squared tests)
- Feature engineering creates actionable behavioral indicators (task speed, volume, velocity)
- Demographic groups (platforms, hardware, locales) show varying wonky task rates
- Both statistical tests provide validation - agreement increases confidence in findings
- Visualizations highlight features with strongest discrimination power

**Feature Selection Insights:**
- Top-ranked features combine statistical significance with practical effect sizes
- Temporal patterns (business hours, night tasks) are strong discriminators
- Behavioral features (task volume, speed) show consistent differences
- Demographic features may be useful as interaction terms

**Next Steps:**
- Use top-ranked features from feature ranking for model training
- Include temporal features that show significant chi-squared associations
- Consider demographic interactions based on group comparison results
- Apply fraud risk thresholds to filter suspicious users
- Build models using features identified as significant in EDA
- Validate feature importance using model-based methods (e.g., Random Forest feature importance)
