In [0]:
%pip install pyyaml>=6.0 -q

In [0]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

# Import yaml with error handling: replace the bare ImportError with an
# actionable message telling the reader how to install PyYAML.
try:
    import yaml
except ImportError as exc:
    # The specifier in the hint is quoted because an unquoted ">=" would be
    # treated as an output redirection by the shell that %pip delegates to.
    # `from exc` keeps the original failure visible as the direct cause.
    raise ImportError(
        "PyYAML is not installed. Please run the previous cell to install it, "
        'or run: %pip install "pyyaml>=6.0"'
    ) from exc

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import our modular functions
from eda.feature_engineering import (
    create_time_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score
)
from eda.statistical_tests import (
    compare_groups_statistically,
    compare_groups_with_both_tests,
    analyze_thresholds
)
from eda.visualizations import (
    create_histogram,
    create_box_plot,
    create_scatter_plot,
    create_bar_plot
)

# Load configuration files (paths are relative to the current working directory).
# The encoding is given explicitly so parsing does not depend on the platform's
# default text encoding (which is not UTF-8 everywhere, e.g. on Windows).
with open('../configs/feature_engineering.yaml', 'r', encoding='utf-8') as f:
    feature_config = yaml.safe_load(f)

with open('../configs/statistical_tests.yaml', 'r', encoding='utf-8') as f:
    stats_config = yaml.safe_load(f)

with open('../configs/data_paths.yaml', 'r', encoding='utf-8') as f:
    paths_config = yaml.safe_load(f)

# Do not truncate columns when DataFrames are displayed.
pd.set_option('display.max_columns', None)
# "\u2713" is the check mark; written as an escape so the message survives
# re-encoding of the notebook file (it had previously been garbled).
print("\u2713 Imports and configs loaded successfully")


### File Definitions

- **user_info_df**: DataFrame of respondent x task level data for all users (not just wonky studies)
- **wonky_studies_df**: DataFrame of respondents involved in studies with unexpected outcomes (e.g., negative impacts where positive impacts were expected)

A study is "wonky" if its outcome is unexpected (e.g., an advertisement study showing a negative impact of media exposure, which is counter-intuitive).


### Load

In [0]:
# Resolve the repository's `misc` directory relative to where the notebook runs.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, os.pardir))
misc_dir = os.path.join(repo_root, "misc")
os.makedirs(misc_dir, exist_ok=True)

# Only the file names from the config are used; both files are read from `misc`.
output_files = paths_config['output_files']
output_path = os.path.join(misc_dir, os.path.basename(output_files['user_info_df']))
wonky_path = os.path.join(misc_dir, os.path.basename(output_files['wonky_user_counts']))

user_info_df = pd.read_parquet(output_path)
wonky_studies_df = pd.read_parquet(wonky_path)

print("Files loaded successfully:")
print(f"  - {output_path}")
print(f"  - {wonky_path}")

### Feature Engineering

In [0]:
# Add time-based features derived from the task completion date.
# The helper is expected to provide the `is_night` / `is_weekend` columns read below.
user_info_df = create_time_features(
    user_info_df,
    date_col="date_completed",
)

print(f"Night tasks: {user_info_df['is_night'].mean()*100:.1f}%")
print(f"Weekend tasks: {user_info_df['is_weekend'].mean()*100:.1f}%")

In [0]:
# Add task-speed features; all cut-offs come from the `time_thresholds`
# section of the feature-engineering config.
time_thresholds = feature_config['time_thresholds']
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    suspicious_threshold=time_thresholds['suspicious_fast_seconds'],
    very_fast_threshold=time_thresholds['very_fast_seconds'],
    # Configured in minutes, passed on in seconds.
    very_slow_threshold=time_thresholds['very_slow_minutes'] * 60,
)

print(f"Suspiciously fast (<{feature_config['time_thresholds']['suspicious_fast_seconds']}s): {user_info_df['is_suspiciously_fast'].sum():,} ({user_info_df['is_suspiciously_fast'].mean()*100:.2f}%)")


In [0]:
# Build respondent-level behavioural features from the task-level frame.
# Volume percentiles live under `volume_thresholds`; the velocity settings are
# top-level config keys. Both are merged into the single dict the helper takes.
respondent_features = create_respondent_behavioral_features(
    user_info_df,
    respondent_id_col="respondentPk",
    date_col="date_completed",
    config={
        **{
            key: feature_config['volume_thresholds'][key]
            for key in ('high_volume_percentile', 'extreme_volume_percentile')
        },
        **{
            key: feature_config[key]
            for key in ('velocity_bins', 'velocity_labels')
        },
    },
)

print(f"Aggregated to {respondent_features.shape[0]:,} respondents")
print(f"Avg tasks per respondent: {respondent_features['total_tasks'].mean():.2f}")
print(f"Avg suspicious fast rate: {respondent_features['suspicious_fast_rate'].mean()*100:.2f}%")

In [0]:
# Add wonky-study features to the respondent-level frame, keyed on respondentPk.
# The helper is expected to provide the `wonky_task_ratio` and `is_high_wonky`
# columns summarised below.
respondent_features = add_wonky_features(
    respondent_features,
    wonky_studies_df,
    respondent_id_col="respondentPk"
)

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("Wonky features added")
print(f"Respondents with wonky tasks: {(respondent_features['wonky_task_ratio'] > 0).sum():,}")
print(f"High wonky concentration (>50%): {respondent_features['is_high_wonky'].sum():,}")


In [0]:
# Fraud risk score: forward only the fraud-related settings from the config.
respondent_features = create_fraud_risk_score(
    respondent_features,
    config={
        key: feature_config[key]
        for key in (
            'fraud_score_weights',
            'fraud_score_thresholds',
            'fraud_score_bins',
            'fraud_score_labels',
            'suspected_fraud_threshold',
        )
    },
)

# Wonky risk score: same pattern, using the wonky-related settings.
respondent_features = create_wonky_risk_score(
    respondent_features,
    config={
        key: feature_config[key]
        for key in (
            'wonky_score_weights',
            'wonky_score_thresholds',
            'wonky_score_bins',
            'wonky_score_labels',
        )
    },
)

print("Fraud risk distribution:")
print(respondent_features['fraud_risk_tier'].value_counts().sort_index())
print(f"Suspected fraud rate: {respondent_features['suspected_fraud'].mean()*100:.2f}%")


### Hypothesis Testing

Running Mann-Whitney U test and Welch's t-test for validation.

Check if Mann-Whitney U and Welch's t-test agree on significance.
When both tests agree, we have higher confidence in the results.
When they disagree, it signals a need for further investigation.

Compare wonky vs non-wonky users using statistical tests:

- Mann-Whitney U test (non-parametric, primary test)
- Welch's t-test (parametric validator/sense check)

Both tests are run together via `compare_groups_with_both_tests()`.

In [0]:
# Binary flag: 1 when a respondent's wonky task ratio is above zero, else 0.
respondent_features['has_wonky_tasks'] = (
    respondent_features['wonky_task_ratio'].gt(0).astype(int)
)

print(f"Wonky users: {respondent_features['has_wonky_tasks'].sum():,} ({respondent_features['has_wonky_tasks'].mean()*100:.2f}%)")
print(f"Non-wonky users: {(respondent_features['has_wonky_tasks'] == 0).sum():,} ({(respondent_features['has_wonky_tasks'] == 0).mean()*100:.2f}%)")

In [0]:
# Run both tests for every configured metric. The grouping column, the two
# group values, the metric list and the significance level all come from the
# statistical-tests config.
group_comparison = stats_config['group_comparison']
statistical_results = compare_groups_with_both_tests(
    respondent_features,
    group_col=group_comparison['group_col'],
    metrics=stats_config['test_metrics'],
    group1_value=group_comparison['group1_value'],
    group2_value=group_comparison['group2_value'],
    significance_level=stats_config['significance_level'],
)

print("Statistical Test Results (Mann-Whitney U + Welch's t-test):")
print("=" * 100)
display(statistical_results)


In [0]:
# Summarise how often the two tests agree; skipped when the results frame
# has no `tests_agree` column.
if 'tests_agree' in statistical_results.columns:
    agreement_flags = statistical_results['tests_agree']
    agreement_stats = agreement_flags.value_counts()
    print("Test Agreement Analysis:")
    print(f"Metrics where tests agree: {agreement_stats.get(True, 0)}")
    print(f"Metrics where tests disagree: {agreement_stats.get(False, 0)}")

    # List the disagreeing metrics, if there are any.
    disagree = statistical_results.loc[agreement_flags.eq(False)]
    if not disagree.empty:
        print("\nMetrics where tests disagree (need investigation):")
        disagreement_columns = [
            'metric',
            'mw_p_value',
            'welch_p_value',
            'mw_significant',
            'welch_significant',
        ]
        display(disagree[disagreement_columns])


In [0]:
# Threshold analysis for two key features against the wonky flag.
# Both runs use the same default bins from the statistical-tests config.
print("Threshold Analysis: Task Volume")
print("=" * 80)
default_bins = stats_config['threshold_analysis']['default_bins']
volume_thresholds = analyze_thresholds(
    respondent_features,
    feature='total_tasks',
    bins=default_bins,
    target_col='has_wonky_tasks',
)
display(volume_thresholds)

print("\nThreshold Analysis: Suspicious Fast Rate")
print("=" * 80)
speed_thresholds = analyze_thresholds(
    respondent_features,
    feature='suspicious_fast_rate',
    bins=default_bins,
    target_col='has_wonky_tasks',
)
display(speed_thresholds)

In [0]:
# Example figures built with the shared plotting helpers.

# 1) Distribution of the fraud risk score, coloured by risk tier.
fraud_risk_labels = {'fraud_risk_score': 'Fraud Risk Score (0-10)'}
fig_fraud_risk = create_histogram(
    respondent_features,
    x='fraud_risk_score',
    color='fraud_risk_tier',
    nbins=11,
    title='Fraud Risk Score Distribution',
    labels=fraud_risk_labels,
)
fig_fraud_risk.show()

# 2) Average task time for wonky vs non-wonky users, with the 0 group shown first.
task_time_labels = {
    'has_wonky_tasks': 'Has Wonky Tasks (1=Yes, 0=No)',
    'avg_task_time': 'Average Task Time (seconds)',
}
fig_speed = create_box_plot(
    respondent_features,
    x='has_wonky_tasks',
    y='avg_task_time',
    color='has_wonky_tasks',
    title='Average Task Time: Wonky vs Non-Wonky Users',
    labels=task_time_labels,
    category_orders={'has_wonky_tasks': [0, 1]},
    boxmean='sd',
)
fig_speed.show()

## Summary

**Key Findings:**
- Statistical tests help identify significant differences between wonky and non-wonky users
- Feature engineering creates actionable behavioral indicators
- Fraud risk scores combine multiple signals into interpretable metrics
- Both Mann-Whitney U and Welch's t-test provide validation of findings

**Next Steps:**
- Use significant features from EDA for model training
- Apply fraud risk thresholds to filter suspicious users
- Build models using features identified as significant
