In [0]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

try:
    import yaml
except ImportError:
    raise ImportError(
        "PyYAML is not installed. Please run the previous cell to install it, "
        "or run: %pip install pyyaml>=6.0"
    )

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_task_features,
    create_time_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score
)
from eda.statistical_tests import (
    compare_groups_statistically,
    compare_groups_with_both_tests,
    analyze_thresholds,
    perform_chi_square_tests,
    perform_mannwhitney_tests,
    perform_welch_ttests,
    perform_two_proportion_z_tests,
    compare_demographic_groups
)

from eda.visualizations import (
    create_histogram,
    create_box_plot,
    create_scatter_plot,
    create_bar_plot,
    create_temporal_breakdown_summary,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_dual_axis_statistical_chart,
    create_feature_breakdown_table,
    create_distribution_comparison,
    calculate_temporal_feature_deltas,       
    create_chi_squared_delta_dual_axis_chart,
)

# Load configs
def _read_yaml(config_path):
    """Return the parsed contents of one YAML file, read with the safe loader."""
    with open(config_path, 'r') as handle:
        return yaml.safe_load(handle)


feature_config = _read_yaml('../configs/feature_engineering.yaml')
stats_config = _read_yaml('../configs/statistical_tests.yaml')
paths_config = _read_yaml('../configs/data_paths.yaml')

# Do not truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### File Definitions

- **user_info_df**: DataFrame of respondent x task level data for all users (not just wonky studies)
- **wonky_studies_df**: DataFrame of respondents involved in studies with unexpected outcomes (negative impacts when positive expected)

A study is "wonky" if the outcome is unexpected (e.g., advertisement showed negative impacts of media, which is counter-intuitive).


### Load

In [0]:
# The repository root is the parent of the notebook's working directory;
# the parquet inputs are read from its `misc` folder.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")


def _misc_file(config_key):
    """Return the path inside `misc_dir` carrying the file name configured under `output_files`."""
    configured_path = paths_config['output_files'][config_key]
    return os.path.join(misc_dir, os.path.basename(configured_path))


output_path = _misc_file('user_info_df')
wonky_counts_path = _misc_file('wonky_user_counts')
wonky_respondent_df_path = _misc_file('wonky_respondent_df')
wonky_respondent_summary_path = _misc_file('wonky_respondent_summary')

# total user info
user_info_df = pd.read_parquet(output_path)
# normal tasks and wonky tasks for wonky task respondents
wonky_counts = pd.read_parquet(wonky_counts_path)
# task level info for wonky task respondents
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
# summary of wonky task respondents
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)

In [0]:
# user_info_df[(user_info_df['respondentPk'] == '361d4dfc-97e8-439a-b969-36163260ea4b') & (user_info_df['wonky_study_count'] > 0)]['date_completed']

In [0]:
wonky_respondent_df[['balance_respondentPk', 'request-remote-addr']].groupby('balance_respondentPk').count().sort_values('request-remote-addr', ascending=False).head(10)

In [0]:
detailed_wonky_respondent_df = wonky_respondent_df.merge(user_info_df, left_on=['balance_respondentPk', 'task_pk'], right_on=['respondentPk', 'taskPk'], how='left')

In [0]:
# Attach `wonky_study_count` from wonky_respondent_df to every respondent x task row.
# Left join: rows without a match keep NaN in `wonky_study_count`.
# The guard makes the cell safe to re-run; a second merge would otherwise create
# `wonky_study_count_x` / `wonky_study_count_y` and break every later cell.
if 'wonky_study_count' not in user_info_df.columns:
    wonky_task_keys = ['balance_respondentPk', 'task_pk']
    # The other cells key user_info_df by `respondentPk`; fall back to it when the
    # `balance_respondentPk` column is absent instead of failing with a KeyError.
    respondent_key = (
        'balance_respondentPk'
        if 'balance_respondentPk' in user_info_df.columns
        else 'respondentPk'
    )
    # One row per (respondent, task) on the right side, so the left join cannot
    # multiply task rows; the first occurrence wins when a key is repeated.
    wonky_counts_per_task = (
        wonky_respondent_df[wonky_task_keys + ['wonky_study_count']]
        .drop_duplicates(subset=wonky_task_keys)
    )
    user_info_df = user_info_df.merge(
        wonky_counts_per_task,
        left_on=[respondent_key, 'taskPk'],
        right_on=wonky_task_keys,
        how='left',
    )

In [0]:
# Use the notebook-level display() helper (as the later cells do); stock pandas
# DataFrames do not define a .display() method.
display(wonky_respondent_summary)

In [0]:
wonky_respondent_df

In [0]:
# Preview the first rows of both frames.
print(user_info_df.head())

print(wonky_respondent_df.head())

# Null count per column of user_info_df, shown as a table.
df = pd.DataFrame(user_info_df.isnull().sum(), columns=['null_count'])
display(df.reset_index())

# The frame inspected below is wonky_respondent_df, so the label names it
# (it previously said "wonky_studies_df", which is not a variable in this notebook).
print("\nwonky_respondent_df - Missing values:")
missing_wonky = wonky_respondent_df.isnull().sum()
# Only list the columns that actually contain missing values.
print(missing_wonky[missing_wonky > 0])

In [0]:
wonky_counts

In [0]:
# Descriptive statistics for the key numeric columns that exist in user_info_df.
key_numeric_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality', 'task_completed']
available_cols = [col for col in key_numeric_cols if col in user_info_df.columns]
print(user_info_df[available_cols].describe())

if 'wonky_study_flag' in user_info_df.columns:
    print("\n" + "=" * 80)
    print("COMPARISON BY wonky_study_flag (Task Level)")
    print("=" * 80)
    comparison_cols = ['task_time_taken_s', 'task_points', 'risk', 'quality']
    comparison_cols = [col for col in comparison_cols if col in user_info_df.columns]

    if len(comparison_cols) > 0:
        wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 1]
        non_wonky_study_tasks = user_info_df[user_info_df['wonky_study_flag'] == 0]

        print("\nWonky Study Tasks (wonky_study_flag=1):")
        print(wonky_study_tasks[comparison_cols].describe())

        print("\nNon-Wonky Study Tasks (wonky_study_flag=0):")
        print(non_wonky_study_tasks[comparison_cols].describe())

        # The rest of the notebook uses `wonky_study_count`; accept either spelling so
        # this comparison is not silently skipped when only that column is present.
        wonky_count_col = next(
            (
                col for col in ('wonky_studies_count', 'wonky_study_count')
                if col in user_info_df.columns
            ),
            None,
        )
        if wonky_count_col is not None:
            wonky_user_tasks = user_info_df[user_info_df[wonky_count_col] > 0]
            print(f"\nTasks from Users with Wonky Studies ({wonky_count_col} > 0):")
            print(wonky_user_tasks[comparison_cols].describe())

print("\n" + "=" * 80)
# The frame summarised here is wonky_counts, so the banner names it.
print("STATISTICAL SUMMARY: wonky_counts")
print("=" * 80)
print(wonky_counts.describe())


### Feature Engineering

In [0]:
main_features = []

### Behavioural Stuff

#### Task amounts

In [0]:
# Cumulative share of all rows by `totaal_tasks_completed`, ordered by the
# task-count value (the index), first ten values only.
task_count_share = user_info_df["totaal_tasks_completed"].value_counts() / len(user_info_df)
task_count_share.sort_index().cumsum().head(10)


In [0]:
# Same cumulative share, restricted to rows with wonky_study_count > 0 and
# ordered by the task-count value (the index), first ten values only.
wonky_rows = user_info_df[user_info_df['wonky_study_count'] > 0]
wonky_task_count_share = wonky_rows['totaal_tasks_completed'].value_counts() / len(wonky_rows)
wonky_task_count_share.sort_index().cumsum().head(10)


In [0]:
user_info_df[['totaal_tasks_completed']]

In [0]:
user_info_df, task_labels = create_task_features(user_info_df, 'totaal_tasks_completed')

In [0]:
import pandas as pd
import plotly.graph_objects as go
from typing import List


def _temporal_group_deltas(
    df: pd.DataFrame,
    temporal_features: List[str],
    group_col: str,
    group_threshold: float,
) -> pd.DataFrame:
    """
    Compute, per feature, its mean (as a percentage) in each group and the difference.

    Returns a DataFrame with columns ``feature`` (display name), ``delta``
    (wonky minus non-wonky), ``wonky_pct`` and ``non_wonky_pct``. Features that
    are not columns of ``df`` are skipped, so the result may be empty.
    """
    # Friendly labels for the known flags; anything else is title-cased.
    feature_display_map = {
        'is_weekend': "Weekend tasks",
        'is_night': "Night tasks (10 PM - 6 AM)",
        'is_business_hour': "Business hour tasks (9 AM - 5 PM)",
        'is_business_hour_weekday': "Business hour tasks weekday",
        'is_business_hour_weekend': "Business hour tasks weekend",
    }

    # The group masks do not depend on the feature, so build them once.
    group_values = df[group_col]
    wonky_mask = group_values > group_threshold

    # Non-wonky rows: the NaN rows when the column has any NaN, otherwise the rows equal to 0.
    # NOTE(review): rows that fall in neither mask (a 0 alongside NaNs, or values in
    # (0, group_threshold]) are excluded from both groups - confirm this is intended.
    if group_values.isna().any():
        non_wonky_mask = group_values.isna()
    else:
        non_wonky_mask = group_values == 0

    has_wonky = bool(wonky_mask.any())
    has_non_wonky = bool(non_wonky_mask.any())

    rows = []
    for feature in temporal_features:
        if feature not in df.columns:
            continue

        # An empty group contributes 0.0 rather than NaN.
        wonky_pct = df.loc[wonky_mask, feature].mean() * 100 if has_wonky else 0.0
        non_wonky_pct = df.loc[non_wonky_mask, feature].mean() * 100 if has_non_wonky else 0.0

        rows.append({
            'feature': feature_display_map.get(feature, feature.replace('_', ' ').title()),
            'delta': wonky_pct - non_wonky_pct,
            'wonky_pct': wonky_pct,
            'non_wonky_pct': non_wonky_pct,
        })

    # Explicit columns keep the frame well-formed even when no feature matched.
    return pd.DataFrame(rows, columns=['feature', 'delta', 'wonky_pct', 'non_wonky_pct'])


def create_temporal_breakdown_chart(
    df: pd.DataFrame,
    temporal_features: List[str],
    group_col: str = "wonky_study_count",
    group_threshold: float = 0,
) -> "go.Figure":
    """
    Create Plotly chart showing percentage delta between two groups for temporal features.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with temporal features and group column
    temporal_features : List[str]
        List of temporal feature column names; names that are not columns of
        ``df`` are skipped
    group_col : str, default "wonky_study_count"
        Column name for grouping
    group_threshold : float, default 0
        Rows with ``group_col`` above this value form the wonky group

    Returns
    -------
    go.Figure
        Horizontal Plotly bar chart of the deltas (wonky minus non-wonky, in
        percentage points), sorted ascending

    Raises
    ------
    ValueError
        If ``group_col`` is not a column of ``df``, or if none of
        ``temporal_features`` is a column of ``df`` (there would be nothing to plot).
    """
    # Validate group column
    if group_col not in df.columns:
        available_cols = [
            col for col in df.columns
            if 'wonky' in col.lower() or 'study' in col.lower()
        ]
        raise ValueError(
            f"Group column '{group_col}' not found. "
            f"Available: {available_cols[:10] if available_cols else 'None'}"
        )

    chart_df = _temporal_group_deltas(df, temporal_features, group_col, group_threshold)

    # Fail with a clear message instead of an opaque KeyError on the sort below.
    if chart_df.empty:
        raise ValueError(
            f"None of the requested features are columns of df: {list(temporal_features)}"
        )

    chart_df = chart_df.sort_values('delta', ascending=True)

    # Determine bar colors (red for negative, green for positive)
    colors = ['#ff4b4b' if x < 0 else '#51cf66' for x in chart_df['delta']]

    fig = go.Figure(
        data=[
            go.Bar(
                x=chart_df['delta'],
                y=chart_df['feature'],
                orientation='h',
                marker_color=colors,
                text=[f"{x:+.1f}%" for x in chart_df['delta']],
                textposition='auto',
                hovertemplate=(
                    "<b>%{y}</b><br>"
                    "Delta: %{x:+.1f}%<br>"
                    "<extra></extra>"
                ),
            )
        ]
    )

    fig.update_layout(
        title_text="Feature Differences: Wonky vs Non-Wonky Groups",
        xaxis_title="% Point Delta (Wonky - Non-Wonky)",
        yaxis_title="Feature",
        # Grow the figure with the number of bars so labels stay readable.
        height=400 + (len(chart_df) * 20),
        showlegend=False,
        hovermode='closest',
        template='plotly_white',
    )

    # Zero reference line separating negative from positive deltas.
    fig.add_vline(x=0, line_dash="dash", line_color="gray", opacity=0.5)

    return fig


In [0]:
print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=task_labels,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

In [0]:
fig = create_temporal_breakdown_chart(
    user_info_df,
    temporal_features=task_labels,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

In [0]:
chi_square_results_days_task = perform_chi_square_tests(
    user_info_df,
    feature_set=task_labels,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_days_task.reset_index())

In [0]:
ztest_results_days_task = perform_two_proportion_z_tests(
    user_info_df, task_labels, group_var="wonky_study_count"
)
# display(ztest_results_days_active[ztest_results_days_active['significant']].reset_index())
display(ztest_results_days_task.reset_index())

In [0]:
# Build wonky_tasknum BEFORE displaying it: the display previously sat in an earlier
# cell than the definition, which fails with a NameError on a fresh kernel.
wonky_tasknum = wonky_respondent_summary.merge(
    user_info_df[list(task_labels) + ['respondentPk', 'gender']],
    left_on=['balance_respondentPk'],
    right_on=['respondentPk'],
    how='left',
)
display(wonky_tasknum)

# Long format: one row per (respondent row, task-amount flag).
id_columns = ["respondentPk", "exposure_band", "gender"]
long = (
    wonky_tasknum[id_columns + list(task_labels)]
      .melt(
          id_vars=id_columns,
          value_vars=list(task_labels),
          var_name="task_amounts",
          value_name="flag",
      )
)

# Keep only the flags that are set.
long = long[long["flag"] == 1]

# Distinct respondents per task-amount bucket, exposure band and gender.
counts = (
    long
    .groupby(["task_amounts", "exposure_band", "gender"])
    .agg(n=("respondentPk", "nunique"))
    .reset_index()
)

# Share of each bucket within its exposure band x gender group.
counts["group_total"] = (
    counts
    .groupby(["exposure_band", "gender"])["n"]
    .transform("sum")
)
counts["pct"] = counts["n"] / counts["group_total"] * 100

# display() is the notebook helper used elsewhere; stock pandas has no DataFrame.display().
display(counts)

#### Days active before task

In [0]:
# Earliest `date_completed` per respondent, exposed as `first_task_completed_date`.
min_dates = (
    user_info_df
    .groupby('respondentPk')['date_completed']
    .min()
    .reset_index()
    .rename(columns={'date_completed': 'first_task_completed_date'})
)

In [0]:
# Attach each respondent's first completion date. Dropping a column left over from an
# earlier run keeps the cell re-runnable; without it the merge would emit
# `first_task_completed_date_x` / `_y` and the subtraction below would raise a KeyError.
user_info_df = (
    user_info_df
    .drop(columns=['first_task_completed_date'], errors='ignore')
    .merge(min_dates, on="respondentPk", how="left")
)
# Whole days between the respondent's first completed task and this task.
user_info_df["days_active_before_task"] = (
    user_info_df["date_completed"] - user_info_df["first_task_completed_date"]
).dt.days

In [0]:
import plotly.express as px

# Share of respondent x task rows at each value of days_active_before_task.
# Every row of user_info_df is one completed task, so the percentages are shares of
# tasks (not of users); the column and axis labels say so.
vc = user_info_df["days_active_before_task"].value_counts().sort_index()/len(user_info_df) * 100

vc_df = vc.reset_index()
vc_df.columns = ["days_active_before_task", "percent"]

fig = px.line(
    vc_df,
    x="days_active_before_task",
    y="percent",
    markers=True,
)

fig.update_layout(
    title="Days active before task - all tasks",
    xaxis_title="Days active before task",
    yaxis_title="Share of tasks (%)",
    template="plotly_white"
)

fig.show()


In [0]:
import plotly.express as px

# Same distribution restricted to rows with wonky_study_count > 0.
# Rows are tasks, so the percentages are shares of those tasks (not of users).
wonky_task_rows = user_info_df[user_info_df['wonky_study_count'] > 0]

vc = (
    wonky_task_rows["days_active_before_task"].value_counts().sort_index()
    / len(wonky_task_rows) * 100
)

vc_df = vc.reset_index()
vc_df.columns = ["days_active_before_task", "percent"]

fig = px.line(
    vc_df,
    x="days_active_before_task",
    y="percent",
    markers=True,
)

fig.update_layout(
    title="Days active before task - tasks with wonky_study_count > 0",
    xaxis_title="Days active before task",
    yaxis_title="Share of wonky-study tasks (%)",
    template="plotly_white"
)

fig.show()

In [0]:
import plotly.express as px
import pandas as pd

# Every row of user_info_df is one respondent x task record, so both series below are
# shares of tasks; the legend and axis labels reflect that instead of saying "users".

# 1) All tasks
vc_all = (
    user_info_df["days_active_before_task"]
    .value_counts()
    .sort_index() / len(user_info_df) * 100
)

df_all = vc_all.reset_index()
df_all.columns = ["days_active_before_task", "percent"]
df_all["group"] = "All tasks"

# 2) Tasks with wonky_study_count > 0
wonky_df = user_info_df[user_info_df["wonky_study_count"] > 0]

vc_wonky = (
    wonky_df["days_active_before_task"]
    .value_counts()
    .sort_index() / len(wonky_df) * 100
)

df_wonky = vc_wonky.reset_index()
df_wonky.columns = ["days_active_before_task", "percent"]
df_wonky["group"] = "Wonky-study tasks"

# 3) Combine
plot_df = pd.concat([df_all, df_wonky], ignore_index=True)

# 4) Single line chart with one colour per group
fig = px.line(
    plot_df,
    x="days_active_before_task",
    y="percent",
    color="group",
    markers=True,
)

fig.update_layout(
    title="Days active before task - all tasks vs wonky-study tasks",
    xaxis_title="Days active before task",
    yaxis_title="Share of tasks within group (%)",
    template="plotly_white",
    legend_title_text="Group",
)

fig.show()


In [0]:
(user_info_df["days_active_before_task"].value_counts() / len(user_info_df)).sort_values(ascending=False).cumsum().head(10)

In [0]:
(
    user_info_df["days_active_before_task"]
    .value_counts()
    .div(len(user_info_df))
    .sort_index()          # sort by the index (days)
    .cumsum()
    .head(10)
)


In [0]:

user_info_df[user_info_df['wonky_study_count'] > 0]['days_active_before_task'].value_counts()

In [0]:
(
    user_info_df[user_info_df['wonky_study_count'] > 0]['days_active_before_task']
    .value_counts()
    .div(len(user_info_df[user_info_df['wonky_study_count'] > 0]))
    .sort_index()          # sort by the index (days)
    .cumsum()
    .head(10)
)


In [0]:
user_info_df_shortened = user_info_df[['respondentPk', 'days_active_before_task', 'wonky_study_count']]

In [0]:
# One indicator column per distinct value of days_active_before_task.
series = user_info_df_shortened['days_active_before_task']

# Go through int before str so a float column (which happens as soon as one value is
# missing) yields labels like "3" rather than "3.0"; later cells parse the label
# back with astype(int). Missing values get no indicator column and are all-zero rows.
day_labels = series.dropna().astype('int64').astype(str)

daysactive_dummies = (
    pd.get_dummies(day_labels, dtype=int)
    # Restore the rows dropped above (missing day count) as all-zero rows.
    .reindex(user_info_df_shortened.index, fill_value=0)
    .add_prefix('days_active_')
)

daysactive_cols = daysactive_dummies.columns

# Drop indicator columns left by an earlier run so the join does not fail on
# overlapping column names when the cell is executed again.
stale_cols = [col for col in daysactive_cols if col in user_info_df_shortened.columns]
user_info_df_shortened = user_info_df_shortened.drop(columns=stale_cols).join(daysactive_dummies)

In [0]:
user_info_df_shortened

##### Test

In [0]:
chi_square_results_days_active = perform_chi_square_tests(
    user_info_df_shortened,
    feature_set=daysactive_cols,
    group_var='wonky_study_count',
    significance_level=0.01
)

display(chi_square_results_days_active.reset_index())

In [0]:
chi_days_active_table = chi_square_results_days_active.reset_index()

In [0]:
# Parse the trailing token of the `days_active_<n>` feature name as a number, so the
# sort and the OLS scatter in the next cells are numeric rather than lexicographic
# ("10" sorting before "2"). Labels that are not numeric become NaN.
chi_days_active_table['days'] = pd.to_numeric(
    chi_days_active_table['feature'].str.split('_').str[-1],
    errors='coerce',
)

In [0]:
chi_days_active_table[['days', 'chi2']]

In [0]:
chi_days_active_table[chi_days_active_table['significant']].sort_values('days')

In [0]:
fig = px.scatter(
    chi_days_active_table[chi_days_active_table['significant']].sort_values('days'),
    x="days",
    y="chi2",
    trendline="ols"
)

fig.update_layout(
    xaxis_title="Days",
    yaxis_title="Chi-square statistic"
)

fig.show()

not directional but shows magnitude of differences
 

Huge spike at 6 and 7 days; low impact at 1 day, and 62 days shows the smallest impact. This suggests danger zones at around 1 week and around 2.5 months (ish).

In [0]:
# redundant due to specifically for continuous data. useful for respondent features further down.

# mannwhitney_results_days_active = perform_mannwhitney_tests(
#     user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_days_active.reset_index())

In [0]:
ztest_results_days_active = perform_two_proportion_z_tests(
    user_info_df_shortened, daysactive_cols, group_var="wonky_study_count"
)
display(ztest_results_days_active[ztest_results_days_active['significant']].reset_index())

Sorting by days active (days from first task to task complete) no clear pattern but 6 & 7 days look to be more risky -> users hitting 1 week mark potentially less risky with

also potential cyclical nature of risks wonky behaviour also shown in users hitting the 1 month mark.

safest zone is 1 day mark and 0 day mark.

higher numbers from 50+ tend to reflect safer zones too indicating better behaviour deep into tenure.

Potential consideration >> initial engagement is good, then a 1-week risk, followed by a cyclical risk at 1-1.5 months >> after 2.5 months users are a little safer.

deprioritize users exactly at 6, 7 and 48 days. prioritise 0 and 1 dayers and investigate 7 specifically as that stands out.

In [0]:
# Significant z-test rows, ordered by the numeric day parsed from the feature name;
# `days` is stored as a string again after sorting.
significant_rows = ztest_results_days_active[ztest_results_days_active['significant']].reset_index()
day_numbers = significant_rows['feature'].str.split('_').str[-1].astype(int)

ztest_days_active_table = (
    significant_rows
    .assign(days=day_numbers)
    .sort_values('days')
    .assign(days=lambda table: table['days'].astype(str))
)

In [0]:
# Dual-axis chart: z statistic as bars (left axis), proportion difference as a line (right axis).
df = ztest_days_active_table[['days', 'z_statistic', 'proportion_diff']]

z_bars = go.Bar(
    x=df["days"],
    y=df["z_statistic"],
    name="z statistic",
    marker_color="steelblue",
    opacity=0.7,
)
diff_line = go.Scatter(
    x=df["days"],
    y=df["proportion_diff"],
    name="proportion diff",
    mode="lines+markers",
    line=dict(color="indianred", width=2),
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(z_bars, secondary_y=False)
fig.add_trace(diff_line, secondary_y=True)

fig.update_layout(xaxis_title="Days since first task_complete")
fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff (%pp)", secondary_y=True)

fig.show()

In [0]:
main_features += list(daysactive_cols)
main_features += ['days_active_before_task']

#### Temporal Feature Analysis & Breakdowns - STRONG HYPOTHESIS

Analyzing temporal patterns to identify differences between wonky and non-wonky study tasks.


In [0]:
user_info_df['wonky_task_instances'].unique()

In [0]:
# Create time features using modular function
user_info_df = create_time_features(user_info_df, date_col="date_completed")

print(f"Night tasks: {user_info_df['is_night'].mean()*100:.1f}%")
print(f"Weekend tasks: {user_info_df['is_weekend'].mean()*100:.1f}%")

In [0]:
temporal_features = [
    "is_weekend",
    "is_night",
    "is_business_hour",
    "is_business_hour_weekday",
    "is_business_hour_weekend",
    "is_monday",
    "is_tuesday",
    "is_wednesday",
    "is_thursday",
    "is_friday",
    "is_saturday",
    "is_sunday",
]

print(
    create_temporal_breakdown_summary(
        user_info_df,
        temporal_features=temporal_features,
        group_col="wonky_study_count",
        group_threshold=0,
    )
)

In [0]:
fig = create_temporal_breakdown_chart(
    user_info_df,
    temporal_features=temporal_features,
    group_col="wonky_study_count",
    group_threshold=0)
fig.show()

In [0]:
user_info_df[temporal_features]

**Task completion time is definitely a good gauge. The majority of tasks take place during business hours and are relatively evenly spread across the work week. The LARGEST delta where wonky is more prevalent is in business hours, suggesting professional behaviours.**

##### Tests for Temporal Features

Testing independence between temporal features and wonky study participation.
Chi-squared test determines if temporal patterns differ significantly between wonky and non-wonky groups.


In [0]:
sorted(user_info_df.columns)

In [0]:
user_info_df[temporal_features + ['wonky_study_count']]

In [0]:
chi_square_results_temporal_features = perform_chi_square_tests(
    user_info_df,
    feature_set=temporal_features,
    group_var='wonky_study_count',
    significance_level=0.01
)
display(chi_square_results_temporal_features.reset_index())

In [0]:
# mannwhitney_result_temporal_feature = perform_mannwhitney_tests(user_info_df, temporal_features, group_var='wonky_study_count')
# display(mannwhitney_result_temporal_feature.reset_index())

In [0]:
# welch_results_temporal_feature = perform_welch_ttests(user_info_df, temporal_features, group_var='wonky_study_count')
# display(welch_results_temporal_feature.reset_index())

In [0]:
ztest_results_temporal_feature = perform_two_proportion_z_tests(user_info_df, temporal_features, group_var='wonky_study_count')
display(ztest_results_temporal_feature[ztest_results_temporal_feature['significant']].reset_index())

In [0]:
df = ztest_results_temporal_feature.reset_index()

In [0]:
# Dual-axis chart per temporal feature: z statistic as bars (left axis),
# proportion difference as a line (right axis).
z_bars = go.Bar(
    x=df["feature"],
    y=df["z_statistic"],
    name="z statistic",
    marker_color="steelblue",
    opacity=0.8,
)
diff_line = go.Scatter(
    x=df["feature"],
    y=df["proportion_diff"],
    name="proportion diff",
    mode="lines+markers",
    line=dict(color="indianred", width=2),
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(z_bars, secondary_y=False)
fig.add_trace(diff_line, secondary_y=True)

# Horizontal legend placed above the plot area, right-aligned.
legend_layout = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="z statistic",
    legend=legend_layout,
)

fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()



going with chi squared and z test due to binary outcome

chi is good for magnitude (just association) and z test give directionality 

chi2 - strong association across all but biggest magnitude is friday and business hours; weekend and mid week less of a signal >> business hours x [friday, monday, thursday] could be indicative 

ztest - friday (huge effect), is business hour and is business hour weekday have the largest impact for wonky studies. Lower-risk periods are mondays, thursdays and night.
- no strong signals from weekends and mid week (tuesday wednesday)

all pretty solid (largely due to large sample)

potential end of week rush effect for friday (earning targets, fatigue or rushing for beers)

friday shows 8pp difference pro wonky
business hours show 5-6pp difference pro wonky
- multi tasking with jobs or potentially using work devices (to investigate)
- monday, thursday dampen business hour effects.

business hour effect likely driven by friday effect

night time effect supported by chi2 and ztest 2.3pp difference

takeaway:
users start week with higher focus
mid week is pretty okay (nothing standing out too much)

may be an engagement & rushing problem with friday being main  culprit

things to consider: selection bias -> users complete surveys on these days at varying levels of engagement 
- responsible work focused people might do it on monday as a routine
- casual or rush people tend to do this on fridays
- therefore we observe monday as safer and friday as more risky

In [0]:
# import numpy as np
# import pandas as pd
# from scipy import stats

# def analyze_selection_bias(df, day_safe_col='is_monday', day_risky_col='is_friday', min_tasks=5, wonky_col='wonky_study_count'):
#     """
#     Analyzes within-respondent variation to detect selection bias vs real day effects.
    
#     Args:
#         df: The user_info_df containing task-level data
#         day_safe_col: The column name for the "Safe" day (e.g., 'is_monday')
#         day_risky_col: The column name for the "Risky" day (e.g., 'is_friday')
#         min_tasks: Minimum tasks PER DAY required to be included (default 5)
#         wonky_col: The target variable (e.g., 'wonky_study_count')
#     """
    
#     print(f"--- Selection Bias Analysis: {day_safe_col} vs {day_risky_col} ---")
    
#     # 1. Prepare working data
#     work_df = df.copy()
    
#     # Define day type for aggregation
#     conditions = [
#         work_df[day_safe_col] == 1,
#         work_df[day_risky_col] == 1
#     ]
#     choices = ['safe_day', 'risky_day']
#     work_df['analysis_day_type'] = np.select(conditions, choices, default='other')
    
#     # Filter to only relevant rows
#     work_df = work_df[work_df['analysis_day_type'] != 'other']
    
#     # Create binary flag for rate calculation (did this task have ANY wonkiness?)
#     work_df['is_wonky_event'] = (work_df[wonky_col] > 0).astype(int)
    
#     # 2. Group by Respondent -> Calculate Rates
#     # We sum the wonky events and count total tasks per day-type per user
#     respondent_stats = work_df.groupby(['respondentPk', 'analysis_day_type']).agg(
#         total_tasks=('respondentPk', 'count'),
#         wonky_events=('is_wonky_event', 'sum')
#     ).reset_index()
    
#     respondent_stats['wonky_rate'] = respondent_stats['wonky_events'] / respondent_stats['total_tasks']
    
#     # 3. Pivot to put Day 1 and Day 2 side-by-side for each user
#     user_pivot = respondent_stats.pivot(
#         index='respondentPk', 
#         columns='analysis_day_type', 
#         values=['wonky_rate', 'total_tasks']
#     )
    
#     # Flatten column names (e.g., 'wonky_rate_risky_day', 'total_tasks_safe_day')
#     user_pivot.columns = [f'{col[0]}_{col[1]}' for col in user_pivot.columns]
#     user_pivot = user_pivot.reset_index()
    
#     # 4. Filter for Multi-Day Respondents (The "Control Group")
#     # Users must have enough volume on BOTH days to be statistically useful
#     valid_users = user_pivot[
#         (user_pivot['total_tasks_safe_day'] >= min_tasks) & 
#         (user_pivot['total_tasks_risky_day'] >= min_tasks)
#     ].copy()
    
#     n_users = len(valid_users)
#     print(f"Found {n_users} respondents with >= {min_tasks} tasks on BOTH days.")
    
#     if n_users < 10:
#         print("Not enough users for robust statistical inference. Try lowering 'min_tasks'.")
#         return None

#     # 5. Calculate Within-Person Difference
#     # Positive Diff means Risky Day is TRULY riskier for the same person
#     valid_users['risk_diff'] = valid_users['wonky_rate_risky_day'] - valid_users['wonky_rate_safe_day']
    
#     mean_safe_rate = valid_users['wonky_rate_safe_day'].mean()
#     mean_risky_rate = valid_users['wonky_rate_risky_day'].mean()
#     mean_diff = valid_users['risk_diff'].mean()
    
#     # Paired T-Test (Is the difference statistically significant?)
#     t_stat, p_val = stats.ttest_rel(valid_users['wonky_rate_risky_day'], valid_users['wonky_rate_safe_day'])
    
#     print(f"\nRESULTS:")
#     print(f"1. {day_safe_col} Average Wonky Rate: {mean_safe_rate:.2%}")
#     print(f"2. {day_risky_col} Average Wonky Rate: {mean_risky_rate:.2%}")
#     print(f"3. Mean Difference (Effect Size): {mean_diff:+.2%} pts")
#     print(f"4. Significance (p-value): {p_val:.5f}")
    
#     if p_val < 0.05:
#         print("SIGNIFICANT: The day effect is REAL. The same user performs differently on these days.")
#     else:
#         print("NOT SIGNIFICANT: Selection Bias confirmed. The variation is due to WHO is working, not WHEN.")
        
#     return valid_users

# # --- EXECUTE ---
# selection_bias_df = analyze_selection_bias(user_info_df, day_safe_col='is_monday', day_risky_col='is_friday', min_tasks=5)


controlling for the selection bias and looking at average wonky rates for friday being almost double monday! [using paired t test] suggesting the day effect might be real

In [0]:
main_features += list(temporal_features)

STRONG SIGNIFICANT READ AT 99% LEVEL LARGEST MAGNITUDE FOUND AT NIGHT. LOWEST MAGNITUDE DURING WEEKEND

**The bar is the level of significance between the wonky and non-wonky groups; the line is the delta between wonky and non-wonky in terms of when tasks are completed.**

**A positive delta means wonky participants are more prevalent and a negative delta means they are less prevalent.**

**Business hours, night time and Saturdays look like the overall best separators between wonky and non-wonky participants in terms of task completion time.**

#### Task speed features - OKAY HYPOTHESIS

In [0]:
# Cap task times at the 99.99th percentile because a few very anomalous durations
# throw off the average.
# clip() keeps missing durations as NaN. A comparison-based np.where would replace
# them with the cap, because `NaN < cap` evaluates to False.
task_time_cap = user_info_df['task_time_taken_s'].quantile(0.9999)
user_info_df['task_time_taken_s_capped'] = user_info_df['task_time_taken_s'].clip(upper=task_time_cap)

In [0]:
sorted(user_info_df.columns)

In [0]:
[col for col in user_info_df.columns if 'length' in col]

In [0]:
user_info_df['task_time_taken_s_capped'].max()

In [0]:
user_info_df

In [0]:
# user_info_df = create_task_speed_features(
#     user_info_df,
#     task_time_col="task_time_taken_s_capped",
#     use_std_dev=True,
#     group_by_col="task_length_of_task",
#     min_group_size=5    
# )

# mean_time = user_info_df["task_time_taken_s_capped"].mean()
# std_time = user_info_df["task_time_taken_s_capped"].std()
# print(f"Task time statistics:")
# print(f"  Mean: {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
# print(f"  Std Dev: {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
# print(f"  Fast threshold (mean - 1σ): {mean_time - std_time:.2f}s")
# print(f"  Suspiciously fast threshold (mean - 2σ): {mean_time - 2*std_time:.2f}s")
# print(f"  Slow threshold (mean + 1σ): {mean_time + std_time:.2f}s")
# print(f"  Suspiciously slow threshold (mean + 2σ): {mean_time + 2*std_time:.2f}s")
# print()

# # Display breakdown with wonky vs non-wonky comparison
# print(create_task_speed_breakdown_summary(
#     user_info_df,
#     group_col='wonky_study_count',
#     group_threshold=0
# ))

**In terms of delta, wonky participants tend to range from suspiciously fast to normal, whereas non-wonky participants tend to range from normal to suspiciously slow.**

In [0]:
# Add the speed flag columns from the uncapped task time.
# NOTE(review): the flags are requested with use_std_dev=True while the thresholds
# printed below are percentiles of the same column - confirm the two are meant to match.
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    use_std_dev=True
)

task_times = user_info_df["task_time_taken_s"]

# Percentile cut-offs reported for reference.
suspiciously_fast_threshold = task_times.quantile(0.025)
fast_threshold = task_times.quantile(0.16)
slow_threshold = task_times.quantile(0.84)
suspiciously_slow_threshold = task_times.quantile(0.975)

# Mean/std after limiting values to the 1st-99th percentile range. clip() replaces the
# extremes with the boundary values rather than dropping them.
lower_bound = task_times.quantile(0.01)
upper_bound = task_times.quantile(0.99)
trimmed_data = task_times.clip(lower=lower_bound, upper=upper_bound)
mean_time = trimmed_data.mean()
std_time = trimmed_data.std()

print(f"Task time statistics (using percentiles, robust to outliers):")
print(f"  Mean (trimmed 1%-99%): {mean_time:.2f} seconds ({mean_time/60:.2f} minutes)")
print(f"  Std Dev (trimmed 1%-99%): {std_time:.2f} seconds ({std_time/60:.2f} minutes)")
print(f"  Fast threshold (16th percentile): {fast_threshold:.2f}s ({fast_threshold/60:.2f} min)")
print(f"  Suspiciously fast threshold (2.5th percentile): {suspiciously_fast_threshold:.2f}s ({suspiciously_fast_threshold/60:.2f} min)")
print(f"  Slow threshold (84th percentile): {slow_threshold:.2f}s ({slow_threshold/60:.2f} min)")
print(f"  Suspiciously slow threshold (97.5th percentile): {suspiciously_slow_threshold:.2f}s ({suspiciously_slow_threshold/60:.2f} min)")
print()

# Display breakdown with wonky vs non-wonky comparison; prefer wonky_task_instances
# when that column exists.
if 'wonky_task_instances' in user_info_df.columns:
    group_col_to_use = 'wonky_task_instances'
else:
    group_col_to_use = 'wonky_study_count'

speed_breakdown = create_task_speed_breakdown_summary(
    user_info_df,
    group_col=group_col_to_use,
    group_threshold=0
)
print(speed_breakdown)

In [0]:
user_info_df['task_length_of_task'].unique()

##### Tests

In [0]:
speed_features = ['is_suspiciously_fast', 'is_fast', 'is_normal_speed', 'is_slow', 'is_suspiciously_slow']

In [0]:
user_info_df[speed_features + ['wonky_study_count']]

In [0]:
# Chi-square tests of each speed flag against wonky_study_count (significance level 0.01).
chi_square_results_task_speeds = perform_chi_square_tests(
    user_info_df, feature_set=speed_features, group_var='wonky_study_count', significance_level=0.01
)
display(chi_square_results_task_speeds.reset_index())

In [0]:
# Two-proportion z-tests of each speed flag, grouped on wonky_study_count.
ztest_results_task_speeds = perform_two_proportion_z_tests(
    user_info_df,
    feature_set=speed_features,
    group_var="wonky_study_count",
)
display(ztest_results_task_speeds.reset_index())

In [0]:
# Register the speed flags as candidate features. Only names not already present
# are appended, so re-running this cell does not duplicate entries in main_features.
main_features.extend([name for name in speed_features if name not in main_features])

normal speed has highest magnitude and positive direction in z test -> counter intuitive -> problem could be comprehension rather than rushing/gaming

fast defined as -> 1 standard deviation faster than the average time of the group

fast seems relatively safe and slow is safe, suggesting this might not be a speed issue and is more of a comprehension issue


TODO - calibrate speeds to account for points or survey types (unsure which is best indicator - ask tim or dan)

TODO -> Come up with good Viz across test

#### Device

In [0]:
# One-hot encode hardware_version: one indicator column per distinct (stripped) value.
series = user_info_df['hardware_version']

hardware_dummies = (
    series.explode()               # list-valued entries become one row each; scalars pass through
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()       # collapse back to one row per original index label
)

hardware_cols = hardware_dummies.columns

# Drop indicator columns left behind by a previous run before joining: join()
# raises on overlapping column names, which made this cell fail when re-executed.
user_info_df = user_info_df.drop(columns=hardware_cols, errors='ignore').join(hardware_dummies)

In [0]:
hardware_cols

In [0]:
# Print the breakdown summary for the hardware indicators, grouped on wonky_study_count with threshold 0.
hardware_breakdown_text = create_temporal_breakdown_summary(
    user_info_df,
    temporal_features=hardware_cols,
    group_col="wonky_study_count",
    group_threshold=0,
)
print(hardware_breakdown_text)

huge delta for desktop, but this might be a sample thing. may converge better with 3-6 months of data.

###### Tests

In [0]:
user_info_df[list(hardware_cols) + ['wonky_study_count']]

In [0]:
# Chi-square tests of each hardware indicator against wonky_study_count (significance level 0.01).
chi_square_results_hardware = perform_chi_square_tests(
    user_info_df, feature_set=hardware_cols, group_var='wonky_study_count', significance_level=0.01
)
display(chi_square_results_hardware.reset_index())

In [0]:
# Two-proportion z-tests per hardware indicator; only rows flagged as significant are displayed.
ztest_results_hardware = perform_two_proportion_z_tests(
    user_info_df,
    feature_set=hardware_cols,
    group_var="wonky_study_count",
)
significant_hardware = ztest_results_hardware.loc[ztest_results_hardware['significant']]
display(significant_hardware.reset_index())

In [0]:
user_info_df['hardware_version'].unique()

In [0]:
user_info_df[['hardware_version']].fillna('unknown').value_counts()/len(user_info_df)

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0][['hardware_version']].fillna('unknown').value_counts()/len(user_info_df[user_info_df['wonky_study_count'] > 0])

high signal in desktops and iphones; others are more negligible. largely driven by volume of general participants, but it seems like it needs to be included in the model

something to be investigated in the desktop process, especially as iphone users usually tend to be high quality

In [0]:
# Keep the two hardware indicators that showed signal. Only names not already
# present are appended, so re-running this cell does not duplicate entries.
main_features.extend([name for name in ['Desktop', 'Iphone'] if name not in main_features])

#### Platform

In [0]:
# One-hot encode platform_name: one indicator column per distinct (stripped) value.
series = user_info_df['platform_name']

platform_dummies = (
    series.explode()               # list-valued entries become one row each; scalars pass through
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()       # collapse back to one row per original index label
)

platform_cols = platform_dummies.columns

# Drop indicator columns left behind by a previous run before joining: join()
# raises on overlapping column names, which made this cell fail when re-executed.
user_info_df = user_info_df.drop(columns=platform_cols, errors='ignore').join(platform_dummies)

In [0]:
platform_cols

In [0]:
# Print the breakdown summary for the platform indicators, grouped on wonky_study_count with threshold 0.
platform_breakdown_text = create_temporal_breakdown_summary(
    user_info_df,
    temporal_features=platform_cols,
    group_col="wonky_study_count",
    group_threshold=0,
)
print(platform_breakdown_text)

In [0]:
user_info_df[['platform_name']].fillna('unknown').value_counts()/len(user_info_df)

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0][['platform_name']].fillna('unknown').value_counts()/len(user_info_df[user_info_df['wonky_study_count'] > 0])

##### Tests

In [0]:
user_info_df[list(platform_cols) + ['wonky_study_count']]

In [0]:
# Chi-square tests of each platform indicator against wonky_study_count (significance level 0.01).
chi_square_results_platform = perform_chi_square_tests(
    user_info_df, feature_set=platform_cols, group_var='wonky_study_count', significance_level=0.01
)
display(chi_square_results_platform.reset_index())

In [0]:
# Two-proportion z-tests of each platform indicator, grouped on wonky_study_count.
ztest_results_platform = perform_two_proportion_z_tests(
    user_info_df,
    feature_set=platform_cols,
    group_var="wonky_study_count",
)
display(ztest_results_platform.reset_index())

In [0]:
user_info_df[['Linux', 'iOS', 'Unknown', 'Mac OS X', 'Android', 'Windows']].sum() / len(user_info_df)

Similarly to desktop big magnitude in Linux and iOS but could be just due to volumes

Linux is a platform not used by most people though >> needs a follow-up question
- sometimes used a lot by bot farmers etc

In [0]:
# Keep the two platform indicators that showed signal. Only names not already
# present are appended, so re-running this cell does not duplicate entries.
main_features.extend([name for name in ['Linux', 'iOS'] if name not in main_features])

### Demographic stuff

#### Gambling

In [0]:
series = user_info_df['gambling_participation_mc']

# One-hot encode each gambling mode
gambling_dummies = (
    series.explode()               # list-valued entries become one row each; scalars pass through
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()       # collapse back to one row per original index label
)

gambling_cols = gambling_dummies.columns

# Drop indicator columns left behind by a previous run before joining: join()
# raises on overlapping column names, which made this cell fail when re-executed.
user_info_df = user_info_df.drop(columns=gambling_cols, errors='ignore').join(gambling_dummies)

In [0]:
gambling_cols

In [0]:
# Print the breakdown summary for the gambling indicators, grouped on wonky_study_count with threshold 0.
gambling_breakdown_text = create_temporal_breakdown_summary(
    user_info_df,
    temporal_features=gambling_cols,
    group_col="wonky_study_count",
    group_threshold=0,
)
print(gambling_breakdown_text)

In [0]:
# NOTE(review): create_temporal_breakdown_chart is not among the names imported from
# eda.visualizations in the notebook's first cell — confirm it is imported elsewhere,
# otherwise this cell raises NameError on a fresh kernel.
gambling_chart_kwargs = dict(
    temporal_features=gambling_cols,
    group_col="wonky_study_count",
    group_threshold=0,
)
fig = create_temporal_breakdown_chart(user_info_df, **gambling_chart_kwargs)
fig.show()

###### Tests

In [0]:
user_info_df[list(gambling_cols) + ['wonky_study_count']]

In [0]:
# Chi-square tests of each gambling indicator against wonky_study_count (significance level 0.01).
chi_square_results_gambling = perform_chi_square_tests(
    user_info_df, feature_set=gambling_cols, group_var='wonky_study_count', significance_level=0.01
)
display(chi_square_results_gambling.reset_index())

In [0]:
# mannwhitney_results_gambling = perform_mannwhitney_tests(
#     user_info_df, gambling_cols, group_var="wonky_study_count"
# )
# display(mannwhitney_results_gambling.reset_index())

In [0]:

# welch_results_gambling = perform_welch_ttests(
#     user_info_df, gambling_cols, group_var="wonky_study_count"
# )
# display(welch_results_gambling.reset_index())

In [0]:
# Two-proportion z-tests of each gambling indicator, grouped on wonky_study_count.
ztest_results_gambling = perform_two_proportion_z_tests(
    user_info_df,
    feature_set=gambling_cols,
    group_var="wonky_study_count",
)
display(ztest_results_gambling.reset_index())

In [0]:
# Dual-axis chart of the gambling z-test results: z statistic as bars (primary
# y-axis) and proportion difference as a line (secondary y-axis), per feature.
df = ztest_results_gambling.reset_index()

z_bars = go.Bar(
    x=df["feature"],
    y=df["z_statistic"],
    name="z statistic",
    marker_color="steelblue",
    opacity=0.8,
)
diff_line = go.Scatter(
    x=df["feature"],
    y=df["proportion_diff"],
    name="proportion diff",
    mode="lines+markers",
    line=dict(color="indianred", width=2),
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(z_bars, secondary_y=False)
fig.add_trace(diff_line, secondary_y=True)

fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="z statistic",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)
fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()


counter to the expected results almost every form of gambling associated with lower risk to wonky studies suggests less of a gaming issue again and more of a comprehension issue.

maybe only gamblers are actually better at reading and understanding what they're doing online (attention to detail etc) resulting in less wonky studies

In [0]:
# Register every gambling indicator as a candidate feature. Only names not already
# present are appended, so re-running this cell does not duplicate entries.
main_features.extend([name for name in gambling_cols if name not in main_features])

#### Income

In [0]:
# Letter codes found in fulcrum_household_income -> human-readable income band labels.
income_map = {
    "A": "Less than £15,000",
    "B": "£15,000 to £19,999",
    "C": "£20,000 to £24,999",
    "D": "£25,000 to £29,999",
    "E": "£30,000 to £34,999",
    "F": "£35,000 to £39,999",
    "G": "£40,000 to £44,999",
    "H": "£45,000 to £49,999",
    "I": "£50,000 to £59,999",
    "J": "£60,000 to £74,999",
    "K": "£75,000 to £84,999",
    "L": "£85,000 to £99,999",
    "M": "£100,000 to £124,999",
    "N": "£125,000 to £149,999",
    "O": "£150,000 to £174,999",
    "P": "£175,000 to £199,999",
    "Q": "£200,000 and above",
    "R": "Prefer not to answer",
}

# Codes that are not keys of income_map (including missing values) map to NaN.
user_info_df["fulcrum_household_income_mapped"] = (
    user_info_df["fulcrum_household_income"].map(income_map)
)

# Share of all rows per band; value_counts() skips NaN while the denominator counts
# every row, so the shares sum to less than 1 when some rows have no mapped band.
user_info_df["fulcrum_household_income_mapped"].value_counts()/len(user_info_df)

In [0]:
(user_info_df["fulcrum_household_income_mapped"].value_counts()/len(user_info_df)).T

In [0]:
series = user_info_df['fulcrum_household_income_mapped']

# One-hot encode each income band
income_dummies = (
    series.explode()               # list-valued entries become one row each; scalars pass through
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()       # collapse back to one row per original index label
)

income_cols = income_dummies.columns

# Drop indicator columns left behind by a previous run before joining: join()
# raises on overlapping column names, which made this cell fail when re-executed.
user_info_df = user_info_df.drop(columns=income_cols, errors='ignore').join(income_dummies)

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0]['fulcrum_household_income_mapped'].value_counts()/len(user_info_df[user_info_df['wonky_study_count'] > 0])

In [0]:
user_info_df[list(income_cols) + ['wonky_study_count']]

In [0]:
# NOTE(review): create_temporal_breakdown_chart is not among the names imported from
# eda.visualizations in the notebook's first cell — confirm it is imported elsewhere,
# otherwise this cell raises NameError on a fresh kernel.
income_chart_kwargs = dict(
    temporal_features=income_cols,
    group_col="wonky_study_count",
    group_threshold=0,
)
fig = create_temporal_breakdown_chart(user_info_df, **income_chart_kwargs)
fig.show()

most wonky studies are in lower income groups

##### Test

In [0]:
# Chi-square tests of each income-band indicator against wonky_study_count (significance level 0.01).
chi_square_results_income = perform_chi_square_tests(
    user_info_df, feature_set=income_cols, group_var='wonky_study_count', significance_level=0.01
)
display(chi_square_results_income.reset_index())

In [0]:
# Two-proportion z-tests of each income-band indicator, grouped on wonky_study_count.
ztest_results_income = perform_two_proportion_z_tests(
    user_info_df,
    feature_set=income_cols,
    group_var="wonky_study_count",
)
display(ztest_results_income.reset_index())

In [0]:
df = ztest_results_income.reset_index()

In [0]:
def income_lower_bound(s):
    """Return the numeric lower bound of an income-band label, for sorting.

    Args:
        s: Band label such as "£15,000 to £19,999"; coerced with ``str``.

    Returns:
        float: 0.0 for "Less than ..." bands, the leading amount for
        "X to Y" and "X and above" bands, and ``inf`` for "Prefer not ..."
        labels or anything that cannot be parsed, so those sort last.
    """
    s = str(s)
    if "Less than" in s:
        return 0.0
    if "Prefer not" in s:
        return float("inf")
    s_clean = s.replace("£", "").replace(",", "")
    if "to" in s_clean:
        leading = s_clean.split("to")[0]
    elif "and above" in s_clean:
        leading = s_clean.split("and")[0]
    else:
        return float("inf")
    try:
        return float(leading)
    except ValueError:
        # A label that merely contains "to"/"and above" without a numeric prefix
        # is treated as unparseable instead of aborting the whole .map() call.
        return float("inf")

In [0]:
df['feature'].map(income_lower_bound)

In [0]:
df['new_income_col'] = df['feature'].map(income_lower_bound)

In [0]:
df

In [0]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Dual-axis chart of the income z-test results ordered by the parsed lower bound of
# each band: z statistic as bars (primary y-axis), proportion difference as a line
# (secondary y-axis). The x values are the numeric lower bounds themselves.
df = df.sort_values(by='new_income_col')

z_bars = go.Bar(
    x=df["new_income_col"],
    y=df["z_statistic"],
    name="z statistic",
    marker_color="steelblue",
    opacity=0.8,
)
diff_line = go.Scatter(
    x=df["new_income_col"],
    y=df["proportion_diff"],
    name="proportion diff",
    mode="lines+markers",
    line=dict(color="indianred", width=2),
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(z_bars, secondary_y=False)
fig.add_trace(diff_line, secondary_y=True)

fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="z statistic",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)
fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()


In [0]:
# Same dual-axis chart as above, but the numeric x positions are labelled with the
# income band text and the bars are drawn with narrower gaps.
df["x_num"] = df["new_income_col"]  # from earlier parsing

z_bars = go.Bar(
    x=df["x_num"],
    y=df["z_statistic"],
    name="z statistic",
    marker_color="steelblue",
    opacity=0.8,
)
diff_line = go.Scatter(
    x=df["x_num"],
    y=df["proportion_diff"],
    name="proportion diff",
    mode="lines+markers",
    line=dict(color="indianred", width=2),
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(z_bars, secondary_y=False)
fig.add_trace(diff_line, secondary_y=True)

income_axis = dict(
    title="Income band",
    tickmode="array",
    tickvals=df["x_num"],
    ticktext=df["feature"],
)
fig.update_layout(bargap=0.05, bargroupgap=0.0, xaxis=income_axis)

# Last expression of the cell: the returned figure is what gets rendered.
fig.update_xaxes(tickangle=-45)

In [0]:
# Register every income-band indicator as a candidate feature. Only names not already
# present are appended, so re-running this cell does not duplicate entries.
main_features.extend([name for name in income_cols if name not in main_features])

In [0]:
len(main_features)

#### Income Gender

In [0]:
series = user_info_df['gender']

# One-hot encode each gender value
gender_dummies = (
    series.explode()               # list-valued entries become one row each; scalars pass through
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()       # collapse back to one row per original index label
)

# Prefix the indicator columns so they are recognisable as gender flags.
gender_dummies = gender_dummies.add_prefix('gender_')

gender_cols = gender_dummies.columns

# Drop indicator columns left behind by a previous run before joining: join()
# raises on overlapping column names, which made this cell fail when re-executed.
user_info_df = user_info_df.drop(columns=gender_cols, errors='ignore').join(gender_dummies)

In [0]:
# Register the gender indicators as candidate features. Only names not already
# present are appended, so re-running this cell does not duplicate entries.
main_features.extend([name for name in gender_cols if name not in main_features])
main_features

In [0]:
# Combined "<gender>_<income band>" label per row. astype(str) renders missing values
# as literal text (e.g. "nan"), so rows with a missing part form their own categories.
gender_text = user_info_df['gender'].astype(str)
income_text = user_info_df['fulcrum_household_income_mapped'].astype(str)
user_info_df['gender_fulcrum_household_income_mapped'] = gender_text + "_" + income_text

In [0]:
series = user_info_df['gender_fulcrum_household_income_mapped']

# One-hot encode each gender x income-band combination
gender_income_dummies = (
    series.explode()               # list-valued entries become one row each; scalars pass through
     .str.strip()
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()       # collapse back to one row per original index label
)

gender_income_cols = gender_income_dummies.columns

# Drop indicator columns left behind by a previous run before joining: join()
# raises on overlapping column names, which made this cell fail when re-executed.
user_info_df = user_info_df.drop(columns=gender_income_cols, errors='ignore').join(gender_income_dummies)

In [0]:
(user_info_df["gender_fulcrum_household_income_mapped"].value_counts() / len(user_info_df))

In [0]:
user_info_df[user_info_df['wonky_study_count'] > 0]['gender_fulcrum_household_income_mapped'].value_counts()/len(user_info_df[user_info_df['wonky_study_count'] > 0])

In [0]:
user_info_df[list(gender_income_cols) + ['wonky_study_count']]

In [0]:
# NOTE(review): create_temporal_breakdown_chart is not among the names imported from
# eda.visualizations in the notebook's first cell — confirm it is imported elsewhere,
# otherwise this cell raises NameError on a fresh kernel.
gender_income_chart_kwargs = dict(
    temporal_features=gender_income_cols,
    group_col="wonky_study_count",
    group_threshold=0,
)
fig = create_temporal_breakdown_chart(user_info_df, **gender_income_chart_kwargs)
fig.show()

strong cut off category of female less than 15k

##### Tests

In [0]:
# Chi-square tests of each gender x income indicator against wonky_study_count
# (significance level 0.01); only rows flagged as significant are displayed.
chi_square_results_income_gender = perform_chi_square_tests(
    user_info_df, feature_set=gender_income_cols, group_var='wonky_study_count', significance_level=0.01
)
significant_chi_income_gender = chi_square_results_income_gender.loc[
    chi_square_results_income_gender['significant']
]
display(significant_chi_income_gender.reset_index())

In [0]:
# Two-proportion z-tests per gender x income indicator; only rows flagged as significant are displayed.
ztest_results_income_gender = perform_two_proportion_z_tests(
    user_info_df,
    feature_set=gender_income_cols,
    group_var="wonky_study_count",
)
significant_z_income_gender = ztest_results_income_gender.loc[
    ztest_results_income_gender['significant']
]
display(significant_z_income_gender.reset_index())

In [0]:
# Scratch preview of shortened feature labels; the result is only displayed, never assigned.
# NOTE(review): `df` at this point was last assigned from ztest_results_income, not from
# the gender x income results — confirm this cell is still needed (a later cell redoes
# the renaming on ztest_results_income_gender).
# NOTE(review): "male_|m" and "fe|f" are written as regex alternations but no regex= flag
# is passed; pandas >= 2.0 defaults str.replace to regex=False, in which case they are
# matched literally — verify against the pandas version in use.
(
    df["feature"]
    .str.replace("male_|m", "M")
    .str.replace("fe|f", "F")
    .str.replace(",000", "k")
    .str.replace(",999", "k")
    .str.replace(" and above", "+")
    .str.replace("prer not to say", "no_answer")
    .str.replace("Prer not to answer", "no_answer")
    .str.replace("Less than ", "<")
)

In [0]:
import re
import numpy as np

# assume df is your DataFrame and 'feature' is the column name
def extract_max_salary(s: str) -> float:
    """Return the upper salary bound (in pounds) encoded in a shortened band label.

    Expects amounts already abbreviated to thousands, e.g. "M£200k+",
    "FM£100k to £124k", "FM<£15k".

    Args:
        s: Feature label to parse.

    Returns:
        The upper amount of a "£Xk to £Yk" range, or the single amount of a
        "£Xk+" / "<£Xk" label, multiplied by 1000. NaN when the label holds
        no "£<n>k" amount (e.g. "Mno_answer", "None_nan").
    """
    # The range pattern must be tried first: the single-amount pattern also matches
    # the lower bound of a range and would otherwise return that instead of the max.
    m = re.search(r"£(\d+)\s*k\s*to\s*£(\d+)\s*k", s)
    if m:
        return float(m.group(2)) * 1000

    # Single amount: open-ended top band ("£200k+") or bottom band ("<£15k").
    m = re.search(r"£(\d+)\s*k", s)
    if m:
        return float(m.group(1)) * 1000

    return np.nan

# NOTE(review): `df` here has not yet been rebuilt from ztest_results_income_gender (that
# happens in a later cell). extract_max_salary only matches amounts written as "£<n>k",
# so labels that have not been shortened would all map to NaN — confirm this line is
# still wanted.
df["max_salary"] = df["feature"].apply(extract_max_salary)

In [0]:
df

In [0]:
# Dual-axis chart of the gender x income z-test results, with shortened labels and
# ordered by the upper salary bound parsed from each label.
df = ztest_results_income_gender.reset_index()

# (pattern, replacement) pairs applied in order; "female_" must precede "male_"
# because "male_" is a substring of "female_".
label_rewrites = [
    ("prefer not to say", "no_answer"),
    ("Prefer not to answer", "no_answer"),
    (",000", "k"),
    (",999", "k"),
    (" and above", "+"),
    ("Less than ", "<"),
    ("female_", "F_"),
    ("male_", "M_"),
]
short_labels = df["feature"]
for pattern, replacement in label_rewrites:
    short_labels = short_labels.str.replace(pattern, replacement)
df["feature"] = short_labels

df["max_salary"] = df["feature"].apply(extract_max_salary)

df = df.sort_values('max_salary')

z_bars = go.Bar(
    x=df["feature"],
    y=df["z_statistic"],
    name="z statistic",
    marker_color="steelblue",
    opacity=0.8,
)
diff_line = go.Scatter(
    x=df["feature"],
    y=df["proportion_diff"],
    name="proportion diff",
    mode="lines+markers",
    line=dict(color="indianred", width=2),
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(z_bars, secondary_y=False)
fig.add_trace(diff_line, secondary_y=True)

fig.update_layout(
    xaxis_title="Feature",
    yaxis_title="z statistic",
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
)
fig.update_yaxes(title_text="z statistic", secondary_y=False)
fig.update_yaxes(title_text="proportion diff", secondary_y=True)

fig.show()


aim is to determine if there is a socio economic & gender divide in survey quality

lower incomes seem to carry significantly higher risk (again, possibly a comprehension issue or difficulty understanding the task)

biggest safe zones are 100k + incomes 
(200k lowest risk -5.1pp risk) with strong significance

high income correlating with less stress about money better quality answer, tech literacy and other motivations other than money for filling in surveys

highest risk in the 25k-30k category (+2.8pp)

high earning males are statistically the safest demographic -> tech literate & professional demographic

riskiest cohorts:
- females 45k-50k (+1.8pp)

safe cohorts (both genders 100k+)
less safe (males < 100k, females 25k-50k)
high risk (females 25k-50k)

professionals with high income correlate with high quality.

low income shouldn't be blocked but managed >> consider other tests or qualifiers for comprehension ability

OVERALL FROM BASIC STAT TESTS KEY TAKE AWAYS

Days Active: Day 7 is critical risk.

Temporal: Friday & Business Hours are risky; Nights are safe.

Speed: Normal speed is paradoxically risky; Fast is safe.

Device/Platform: Desktop/Linux/iPhone are critical risks.

Gambling: Gamblers are safe/smart users.

Demographics: High earners are safe.

In [0]:
sorted(user_info_df.columns)

In [0]:
# Attach the income indicator columns and gender to each wonky respondent.
# NOTE(review): user_info_df is respondent x task level, so this left merge can repeat
# a respondent once per task row; the nunique() count below is what keeps a respondent
# from being counted more than once per cell.
wonky_inc = wonky_respondent_summary.merge(
    user_info_df[list(income_cols) + ['respondentPk', 'gender']],
    left_on=['balance_respondentPk'],
    right_on=['respondentPk'],
    how='left',
)

# Long format: one row per (respondent row, income bracket), keeping only the
# bracket(s) whose indicator equals 1.
income_long = (
    wonky_inc[["respondentPk", "exposure_band", "gender"] + list(income_cols)]
      .melt(
          id_vars=["respondentPk", "exposure_band", "gender"],
          value_vars=income_cols,
          var_name="income_bracket",
          value_name="flag",
      )
)

income_long = income_long[income_long["flag"] == 1]

# Distinct respondents per income bracket x exposure band x gender.
counts = (
    income_long
    .groupby(["income_bracket", "exposure_band", "gender"])
    .agg(n=("respondentPk", "nunique"))
    .reset_index()
)

# Percentage of each bracket within its (exposure band, gender) group.
counts["group_total"] = (
    counts
    .groupby(["exposure_band", "gender"])["n"]
    .transform("sum")
)
counts["pct"] = counts["n"] / counts["group_total"] * 100

# display(obj), as used in the rest of the notebook; .display() is not a method of
# a plain pandas DataFrame.
display(counts)

In [0]:
# display(obj), as used in the rest of the notebook; .display() is not a method of
# a plain pandas DataFrame.
display(wonky_inc)

In [0]:
# Prepare groups
# Binary group flag (1 when wonky_study_count > 0) plus the non-null quality scores
# of each group; both are reused by the Risk and Quality sections below.
has_wonky_study = user_info_df['wonky_study_count'] > 0
user_info_df['wonky_group'] = has_wonky_study.astype(int)
wonky_quality = user_info_df.loc[user_info_df['wonky_group'] == 1, 'quality'].dropna()
non_wonky_quality = user_info_df.loc[user_info_df['wonky_group'] == 0, 'quality'].dropna()

#### Risk

In [0]:
# # Descriptive Statistics: Risk scores by group
# print("=" * 80)
# print("RISK SCORE DESCRIPTIVE STATISTICS")
# print("=" * 80)

# # Prepare groups (wonky_group already created in quality analysis)
# wonky_risk = user_info_df[user_info_df['wonky_group'] == 1]['risk'].dropna()
# non_wonky_risk = user_info_df[user_info_df['wonky_group'] == 0]['risk'].dropna()

# print(f"\nWonky Group (wonky_study_count > 0):")
# print(f"  Count: {len(wonky_risk):,}")
# print(f"  Mean: {wonky_risk.mean():.2f}")
# print(f"  Median: {wonky_risk.median():.2f}")
# print(f"  Std Dev: {wonky_risk.std():.2f}")
# print(f"  Min: {wonky_risk.min():.2f}")
# print(f"  Max: {wonky_risk.max():.2f}")
# print(f"  25th percentile: {wonky_risk.quantile(0.25):.2f}")
# print(f"  75th percentile: {wonky_risk.quantile(0.75):.2f}")

# print(f"\nNon-Wonky Group (wonky_study_count == 0):")
# print(f"  Count: {len(non_wonky_risk):,}")
# print(f"  Mean: {non_wonky_risk.mean():.2f}")
# print(f"  Median: {non_wonky_risk.median():.2f}")
# print(f"  Std Dev: {non_wonky_risk.std():.2f}")
# print(f"  Min: {non_wonky_risk.min():.2f}")
# print(f"  Max: {non_wonky_risk.max():.2f}")
# print(f"  25th percentile: {non_wonky_risk.quantile(0.25):.2f}")
# print(f"  75th percentile: {non_wonky_risk.quantile(0.75):.2f}")

# print(f"\nDifference (Wonky - Non-Wonky):")
# risk_mean_diff = wonky_risk.mean() - non_wonky_risk.mean()
# risk_median_diff = wonky_risk.median() - non_wonky_risk.median()
# print(f"  Mean difference: {risk_mean_diff:+.2f}")
# print(f"  Median difference: {risk_median_diff:+.2f}")
# if non_wonky_risk.mean() > 0:
#     print(f"  % difference: {(risk_mean_diff / non_wonky_risk.mean() * 100):+.2f}%")


In [0]:
# Risk score compared between groups with both a non-parametric and a parametric test.
risk_features = ['risk']
risk_test_kwargs = dict(
    feature_set=risk_features,
    group_var='wonky_study_count',
    significance_level=0.01,
)

print("=" * 80)
print("STATISTICAL TESTS: Risk Score Comparison")
print("=" * 80)

# Mann-Whitney U Test (non-parametric)
mannwhitney_results_risk = perform_mannwhitney_tests(user_info_df, **risk_test_kwargs)

print("\nMann-Whitney U Test Results:")
print("-" * 80)
display(mannwhitney_results_risk.reset_index())

# Welch's t-test (parametric, handles unequal variances)
welch_results_risk = perform_welch_ttests(user_info_df, **risk_test_kwargs)

print("\nWelch's t-test Results:")
print("-" * 80)
display(welch_results_risk.reset_index())

In [0]:
# Histogram comparison of the risk score for wonky_group == 1 vs wonky_group == 0.
risk_hist_options = dict(
    feature='risk',
    group_col='wonky_group',
    group1_value=1,
    group2_value=0,
    plot_type='histogram',
    group1_name='Wonky',
    group2_name='Non-Wonky',
    title='Risk Score Distribution: Wonky vs Non-Wonky',
    nbins=50,
    opacity=0.7,
)
fig_risk_hist = create_distribution_comparison(user_info_df, **risk_hist_options)
fig_risk_hist.show()

In [0]:
# Box-plot comparison of the risk score for wonky_group == 1 vs wonky_group == 0.
risk_box_options = dict(
    feature='risk',
    group_col='wonky_group',
    group1_value=1,
    group2_value=0,
    plot_type='box',
    group1_name='Wonky',
    group2_name='Non-Wonky',
    title='Risk Score Distribution: Wonky vs Non-Wonky (Box Plot)',
)
fig_risk_box = create_distribution_comparison(user_info_df, **risk_box_options)
fig_risk_box.show()


In [0]:
# Create risk bins (quartile edges, so the bins are roughly balanced)
risk_percentiles = user_info_df['risk'].quantile([0, 0.25, 0.5, 0.75, 1.0]).values
risk_bins = list(risk_percentiles)
risk_labels = ['Very Low (Q1)', 'Low (Q2)', 'Medium (Q3)', 'High (Q4)']

# Repeated quartile values would give pd.cut non-unique edges; use fixed edges instead.
# NOTE(review): the fixed edges assume risk lies in [0, 1] — confirm.
if len(set(risk_bins)) < len(risk_bins):
    risk_bins = [0, 0.25, 0.5, 0.75, 1.0]

user_info_df['risk_bin'] = pd.cut(
    user_info_df['risk'],
    bins=risk_bins,
    labels=risk_labels,
    include_lowest=True,
    duplicates='drop'
)

# Calculate wonky rates by risk bin. observed=False is spelled out so every bin
# label stays in the summary (even if empty) regardless of the pandas default for
# categorical groupers; named aggregation yields the final column names directly.
risk_bin_summary = (
    user_info_df.groupby('risk_bin', observed=False)
    .agg(
        wonky_count=('wonky_group', 'sum'),
        total_count=('wonky_group', 'count'),
        wonky_rate=('wonky_group', 'mean'),
        avg_risk=('risk', 'mean'),
    )
    .reset_index()
)

risk_bin_summary['wonky_rate_pct'] = risk_bin_summary['wonky_rate'] * 100
risk_bin_summary['non_wonky_count'] = risk_bin_summary['total_count'] - risk_bin_summary['wonky_count']

print("=" * 80)
print("RISK BIN ANALYSIS: Wonky Rates by Risk Level")
print("=" * 80)
display(risk_bin_summary[['risk_bin', 'total_count', 'wonky_count', 'non_wonky_count', 
                          'wonky_rate_pct', 'avg_risk']])


In [0]:
# Chi-square test on risk bins
chi_square_results_risk_bins = perform_chi_square_tests(
    user_info_df,
    feature_set=['risk_bin'],
    group_var='wonky_study_count',
    significance_level=0.01
)

print("\nChi-Square Test Results (Risk Bins):")
print("-" * 80)
display(chi_square_results_risk_bins.reset_index())

# Two-proportion z-tests for each bin vs overall
print("\nTwo-Proportion Z-Tests (Each Risk Bin vs Overall):")
print("-" * 80)

# One 0/1 indicator column per risk bin. The column names are collected as they are
# created rather than rediscovered by substring-matching every column of
# user_info_df: that filter also matched unrelated columns such as the quality-bin
# indicator 'is_Medium_51_75' once the quality cells had been run.
risk_bin_features = []
for bin_label in risk_labels:
    bin_feature_name = "is_" + (
        bin_label.replace(' ', '_').replace('(', '').replace(')', '').replace('-', '_')
    )
    user_info_df[bin_feature_name] = (user_info_df['risk_bin'] == bin_label).astype(int)
    risk_bin_features.append(bin_feature_name)

if len(risk_bin_features) > 0:
    ztest_results_risk_bins = perform_two_proportion_z_tests(
        user_info_df,
        feature_set=risk_bin_features,
        group_var='wonky_study_count',
        significance_level=0.01
    )
    display(ztest_results_risk_bins.reset_index())


In [0]:
# Wonky rate among rows at or below a series of risk thresholds. The thresholds are
# the distinct values of selected risk percentiles; the largest one is matched with
# equality instead of <=.
risk_percentiles_list = [0, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 1.0]
risk_thresholds = sorted({user_info_df['risk'].quantile(p) for p in risk_percentiles_list})

threshold_results_risk = []
last_position = len(risk_thresholds) - 1

for i, threshold in enumerate(risk_thresholds):
    if i == last_position:
        mask = user_info_df['risk'] == threshold
        label = f"Risk = {threshold:.3f}"
    else:
        mask = user_info_df['risk'] <= threshold
        label = f"Risk ≤ {threshold:.3f}"

    total = mask.sum()
    if total > 0:
        wonky_count = user_info_df.loc[mask, 'wonky_group'].sum()
        wonky_rate = wonky_count / total
        avg_risk = user_info_df.loc[mask, 'risk'].mean()

        threshold_results_risk.append(dict(
            threshold=label,
            total_tasks=total,
            wonky_count=wonky_count,
            wonky_rate=wonky_rate,
            wonky_rate_pct=wonky_rate * 100,
            avg_risk=avg_risk,
        ))

threshold_df_risk = pd.DataFrame(threshold_results_risk)

print("=" * 80)
print("RISK THRESHOLD ANALYSIS: Wonky Rates at Different Risk Levels")
print("=" * 80)
display(threshold_df_risk)


In [0]:
# Bar chart of the wonky rate (%) per risk threshold, coloured by the rate itself.
risk_threshold_axis_labels = {'wonky_rate_pct': 'Wonky Rate (%)', 'threshold': 'Risk Threshold'}

fig_risk_threshold = px.bar(
    threshold_df_risk,
    x='threshold',
    y='wonky_rate_pct',
    title='Wonky Rate by Risk Threshold',
    labels=risk_threshold_axis_labels,
    color='wonky_rate_pct',
    color_continuous_scale='Reds',
).update_layout(
    xaxis_title="Risk Threshold",
    yaxis_title="Wonky Rate (%)",
    showlegend=False,
)
fig_risk_threshold.show()


#### Quality

In [0]:
user_info_df['quality'].unique()

In [0]:
# Share of all rows per quality value. fillna(0) folds rows with missing quality into
# the 0 bucket. display(obj) is used as in the rest of the notebook; .display() is not
# a method of a plain pandas DataFrame.
display((user_info_df['quality'].fillna(0).value_counts() / len(user_info_df)).reset_index())

In [0]:
# Share of wonky rows (wonky_study_count > 0) per quality value; missing quality is
# not counted in the numerator. display(obj) is used as in the rest of the notebook;
# .display() is not a method of a plain pandas DataFrame.
wonky_rows = user_info_df[user_info_df['wonky_study_count'] > 0]
display((wonky_rows['quality'].value_counts() / len(wonky_rows)).reset_index())

In [0]:
# Descriptive statistics of the quality score for each group, followed by the
# wonky-minus-non-wonky difference. Uses wonky_quality / non_wonky_quality prepared
# in an earlier cell.
print("=" * 80)
print("QUALITY SCORE DESCRIPTIVE STATISTICS")
print("=" * 80)

quality_groups = (
    ("\nWonky Group (wonky_study_count > 0):", wonky_quality),
    ("\nNon-Wonky Group (wonky_study_count == 0):", non_wonky_quality),
)
for heading, scores in quality_groups:
    print(heading)
    print(f"  Count: {len(scores):,}")
    print(f"  Mean: {scores.mean():.2f}")
    print(f"  Median: {scores.median():.2f}")
    print(f"  Std Dev: {scores.std():.2f}")
    print(f"  Min: {scores.min():.2f}")
    print(f"  Max: {scores.max():.2f}")
    print(f"  25th percentile: {scores.quantile(0.25):.2f}")
    print(f"  75th percentile: {scores.quantile(0.75):.2f}")

print("\nDifference (Wonky - Non-Wonky):")
mean_diff = wonky_quality.mean() - non_wonky_quality.mean()
median_diff = wonky_quality.median() - non_wonky_quality.median()
print(f"  Mean difference: {mean_diff:+.2f}")
print(f"  Median difference: {median_diff:+.2f}")
print(f"  % difference: {(mean_diff / non_wonky_quality.mean() * 100):+.2f}%")


In [0]:
# Quality (and risk) compared between groups with both a non-parametric and a parametric test.
quality_features = ['quality', 'risk']
quality_test_kwargs = dict(
    feature_set=quality_features,
    group_var='wonky_study_count',
    significance_level=0.01,
)

print("=" * 80)
print("STATISTICAL TESTS: Quality Score Comparison")
print("=" * 80)

# Mann-Whitney U Test (non-parametric)
mannwhitney_results_quality = perform_mannwhitney_tests(user_info_df, **quality_test_kwargs)

print("\nMann-Whitney U Test Results:")
print("-" * 80)
display(mannwhitney_results_quality.reset_index())

# Welch's t-test (parametric, handles unequal variances)
welch_results_quality = perform_welch_ttests(user_info_df, **quality_test_kwargs)

print("\nWelch's t-test Results:")
print("-" * 80)
display(welch_results_quality.reset_index())


In [0]:
# Histogram comparison of the quality score for wonky_group == 1 vs wonky_group == 0.
quality_hist_options = dict(
    feature='quality',
    group_col='wonky_group',
    group1_value=1,
    group2_value=0,
    plot_type='histogram',
    group1_name='Wonky',
    group2_name='Non-Wonky',
    title='Quality Score Distribution: Wonky vs Non-Wonky',
    nbins=50,
    opacity=0.7,
)
fig_hist = create_distribution_comparison(user_info_df, **quality_hist_options)
fig_hist.show()


In [0]:
# Distribution Comparison: Histogram of the risk score for wonky vs non-wonky rows.
# The title previously read "Quality Score Distribution" although the plotted
# feature is 'risk'.
fig_hist = create_distribution_comparison(
    user_info_df,
    feature='risk',
    group_col='wonky_group',
    group1_value=1,
    group2_value=0,
    plot_type='histogram',
    group1_name='Wonky',
    group2_name='Non-Wonky',
    title='Risk Score Distribution: Wonky vs Non-Wonky',
    nbins=50,
    opacity=0.7
)
fig_hist.show()

In [0]:
# Create quality bins: [0, 50], (50, 75], (75, 90], (90, 100]
user_info_df['quality_bin'] = pd.cut(
    user_info_df['quality'],
    bins=[0, 50, 75, 90, 100],
    labels=['Low (0-50)', 'Medium (51-75)', 'High (76-90)', 'Very High (91-100)'],
    include_lowest=True
)

# Calculate wonky rates by quality bin. observed=False is spelled out so every bin
# label stays in the summary (even if empty) regardless of the pandas default for
# categorical groupers; named aggregation yields the final column names directly.
quality_bin_summary = (
    user_info_df.groupby('quality_bin', observed=False)
    .agg(
        wonky_count=('wonky_group', 'sum'),
        total_count=('wonky_group', 'count'),
        wonky_rate=('wonky_group', 'mean'),
        avg_quality=('quality', 'mean'),
    )
    .reset_index()
)

quality_bin_summary['wonky_rate_pct'] = quality_bin_summary['wonky_rate'] * 100
quality_bin_summary['non_wonky_count'] = quality_bin_summary['total_count'] - quality_bin_summary['wonky_count']

print("=" * 80)
print("QUALITY BIN ANALYSIS: Wonky Rates by Quality Level")
print("=" * 80)
display(quality_bin_summary[['quality_bin', 'total_count', 'wonky_count', 'non_wonky_count', 
                              'wonky_rate_pct', 'avg_quality']])


In [0]:
# Run the chi-square helper on the binned quality feature.
# NOTE(review): the grouping variable here is 'wonky_study_count', while the
# bin summaries in this notebook aggregate 'wonky_group' — confirm this is intended.
chi_square_options = dict(
    feature_set=['quality_bin'],
    group_var='wonky_study_count',
    significance_level=0.01,
)
chi_square_results_quality_bins = perform_chi_square_tests(user_info_df, **chi_square_options)

print("\nChi-Square Test Results (Quality Bins):")
print("-" * 80)
display(chi_square_results_quality_bins.reset_index())

# Two-proportion z-tests on one 0/1 indicator column per quality bin.
print("\nTwo-Proportion Z-Tests (Each Bin vs Overall):")
print("-" * 80)

# Turn a bin label into a column-name suffix: spaces and hyphens become
# underscores, parentheses are dropped (e.g. 'Low (0-50)' -> 'Low_0_50').
label_to_suffix = str.maketrans({" ": "_", "-": "_", "(": None, ")": None})

# Create the indicator columns and collect their names in the same pass.
quality_bin_features = []
for bin_label in ['Low (0-50)', 'Medium (51-75)', 'High (76-90)', 'Very High (91-100)']:
    indicator_name = f'is_{bin_label.translate(label_to_suffix)}'
    in_this_bin = user_info_df['quality_bin'] == bin_label
    user_info_df[indicator_name] = in_this_bin.astype(int)
    quality_bin_features.append(indicator_name)

ztest_results_quality_bins = perform_two_proportion_z_tests(
    user_info_df,
    feature_set=quality_bin_features,
    group_var='wonky_study_count',
    significance_level=0.01,
)

display(ztest_results_quality_bins.reset_index())


In [0]:
# Wonky rate among rows at or below each quality threshold.
# Every threshold is cumulative (quality <= t) except 100, which selects only
# rows with quality exactly equal to 100.
quality_thresholds = [0, 25, 50, 60, 70, 75, 80, 85, 90, 95, 100]
threshold_results = []

for threshold in quality_thresholds:
    is_exact_match = threshold == 100
    if is_exact_match:
        mask, label = user_info_df['quality'] == threshold, f"Quality = {threshold}"
    else:
        mask, label = user_info_df['quality'] <= threshold, f"Quality ≤ {threshold}"

    total = mask.sum()
    if not total > 0:
        # No rows selected for this threshold: it is left out of the table.
        continue

    selected_rows = user_info_df.loc[mask]
    wonky_count = selected_rows['wonky_group'].sum()
    wonky_rate = wonky_count / total

    threshold_results.append(dict(
        threshold=label,
        total_tasks=total,
        wonky_count=wonky_count,
        wonky_rate=wonky_rate,
        wonky_rate_pct=wonky_rate * 100,
        avg_quality=selected_rows['quality'].mean(),
    ))

threshold_df = pd.DataFrame(threshold_results)

banner = "=" * 80
print(banner)
print("QUALITY THRESHOLD ANALYSIS: Wonky Rates at Different Quality Levels")
print(banner)
display(threshold_df)


In [0]:
# Bar chart of wonky rate (%) per quality threshold; bars are also shaded by
# the same wonky rate value on a red colour scale.
threshold_axis_labels = {'wonky_rate_pct': 'Wonky Rate (%)', 'threshold': 'Quality Threshold'}
threshold_layout = dict(
    xaxis_title="Quality Threshold",
    yaxis_title="Wonky Rate (%)",
    showlegend=False,
)

fig_threshold = px.bar(
    threshold_df,
    x='threshold',
    y='wonky_rate_pct',
    color='wonky_rate_pct',
    color_continuous_scale='Reds',
    labels=threshold_axis_labels,
    title='Wonky Rate by Quality Threshold',
)
fig_threshold.update_layout(**threshold_layout)
fig_threshold.show()


In [0]:
# Analyze quality distribution for wonky vs non-wonky.
# Create more granular bins for detailed analysis. Intervals are
# right-inclusive, with include_lowest=True keeping quality == 0 in the first bin.
user_info_df['quality_bin_detailed'] = pd.cut(
    user_info_df['quality'],
    bins=[0, 30, 50, 70, 85, 95, 100],
    labels=['Very Low (0-30)', 'Low (31-50)', 'Medium (51-70)', 
            'High (71-85)', 'Very High (86-95)', 'Excellent (96-100)'],
    include_lowest=True
)

# Row counts per (bin, wonky_group), pivoted so each wonky_group value becomes
# a column. observed=False is explicit so that empty bins are kept (with zero
# counts) regardless of the pandas version's default for categorical groupers.
detailed_bin_summary = (
    user_info_df.groupby(['quality_bin_detailed', 'wonky_group'], observed=False)
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

if 0 in detailed_bin_summary.columns and 1 in detailed_bin_summary.columns:
    detailed_bin_summary['total'] = detailed_bin_summary[0] + detailed_bin_summary[1]
    # Bins with no rows produce 0/0 -> NaN here; sort_values places them last.
    detailed_bin_summary['wonky_rate_pct'] = (detailed_bin_summary[1] / detailed_bin_summary['total'] * 100).round(2)
    detailed_bin_summary['non_wonky_count'] = detailed_bin_summary[0]
    detailed_bin_summary['wonky_count'] = detailed_bin_summary[1]
    
    print("=" * 80)
    print("DETAILED QUALITY BIN ANALYSIS")
    print("=" * 80)
    display(detailed_bin_summary[['quality_bin_detailed', 'non_wonky_count', 'wonky_count', 
                                   'total', 'wonky_rate_pct']].sort_values('wonky_rate_pct', ascending=False))
else:
    # Say why nothing was shown instead of silently producing no output.
    print("Skipping detailed quality bin analysis: 'wonky_group' must contain both 0 and 1 values.")


In [0]:
# Display the `risk` column of user_info_df (the feature plotted in the risk histogram).
user_info_df['risk']

In [0]:
# NOTE(review): disabled scratch code. It would explode `user_info_df['quality']`,
# one-hot encode the stripped values and join the result back onto user_info_df.
# The `hardware_*` names do not match the `quality` source column — presumably
# copied from a hardware-column cell; confirm, then fix or delete.
# series = user_info_df['quality']

# hardware_dummies = (
#     series.explode()                    
#      .str.strip()                   
#      .pipe(pd.get_dummies)          
#      .groupby(level=0).sum()     
# )

# hardware_cols = hardware_dummies.columns

# user_info_df = user_info_df.join(hardware_dummies)

#### Risk & Quality

#### Exposure bands

In [0]:
# Display wonky_map.
# NOTE(review): confirm an earlier cell defines `wonky_map`, otherwise this
# fails on Restart Kernel -> Run All.
wonky_map

In [0]:
# Share of all user_info_df rows falling in each exposure band. Rows with a
# missing band are not listed but still count in the denominator.
user_info_df['exposure_band'].value_counts().div(len(user_info_df))

In [0]:
# Share of all wonky_respondent_df rows falling in each exposure band. Rows
# with a missing band are not listed but still count in the denominator.
wonky_respondent_df['exposure_band'].value_counts().div(len(wonky_respondent_df))

In [0]:
# Display respondent_features.
# NOTE(review): confirm an earlier cell defines `respondent_features`; if it is
# a large DataFrame, consider showing `.head()` instead of the whole object.
respondent_features

### Respondent_Features

In [0]:
# user_info_df['exposure_band'] = user_info_df['exposure_band'].fillna('unknown')

Correlations are weak across the board. A few stronger ones are device-related, but none is large enough to act on.

Next stage: run the analysis on 3 months of data.

In [0]:
# respondent_features_path = os.path.join(misc_dir,
#                           os.path.basename(paths_config['output_files'].get('respondent_features')))

# respondent_features.to_parquet(respondent_features_path, index=False)