In [0]:
# Import libraries
import sys
import os

sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import yaml

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_task_amount_features,
    create_task_temporal_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score,
    add_rating_delta
)

from eda.visualizations import (
    create_breakdown_summary,
    create_breakdown_chart,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_feature_breakdown_table,
    calculate_temporal_feature_deltas,       
)

# Load the data-path configuration read by later cells (path is relative to the
# notebook's working directory). The explicit encoding keeps parsing independent
# of the platform's default locale.
with open('../configs/data_paths.yaml', 'r', encoding='utf-8') as config_file:
    paths_config = yaml.safe_load(config_file)

# Do not truncate columns when displaying wide DataFrames.
pd.set_option('display.max_columns', None)
# The check mark was previously stored as mis-decoded bytes ("âœ“").
print("✓ Imports and configs loaded successfully")


In [0]:
# Resolve the repository layout from the notebook's working directory:
# the repo root is its parent, and the input files live in <repo_root>/misc.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, os.pardir))
misc_dir = os.path.join(repo_root, "misc")


def _misc_file(config_key):
    """Return the path in ``misc_dir`` named after ``output_files[config_key]``.

    Only the file name of the configured path is used; its directory part is
    replaced by ``misc_dir``.
    """
    configured_path = paths_config['output_files'][config_key]
    return os.path.join(misc_dir, os.path.basename(configured_path))


output_path = _misc_file('user_info_df')
wonky_counts_path = _misc_file('wonky_user_counts')
wonky_respondent_df_path = _misc_file('wonky_respondent_df')
wonky_respondent_summary_path = _misc_file('wonky_respondent_summary')
wonky_map_path = _misc_file('wonky_map')

# Read every parquet input into its own DataFrame.
user_info_df = pd.read_parquet(output_path)
wonky_counts = pd.read_parquet(wonky_counts_path)
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)
wonky_map = pd.read_parquet(wonky_map_path)

In [0]:
# [col for col in user_info_df.columns if 'task' in col]

In [0]:
# pk = '1b6b0277-9f8c-4b7b-be12-28c21c2ce17b'

# user_info_df[(user_info_df['respondentPk'] == pk) & (user_info_df['wonky_study_count'] > 0)][['respondentPk', 'date_completed', 'wonky_study_count', 'totaal_tasks_completed']]

In [0]:
# wonky_map[wonky_map['respondent_pk']  == pk].display()

# user_info_df[(user_info_df['respondentPk']  == pk)][['date_completed', 'wonky_study_count']].display()

# special_user = user_info_df[user_info_df['respondentPk']  == pk]
# special_user['date_completed'] = pd.to_datetime(special_user['date_completed'], format='%Y-%m-%d').dt.date

# special_user_chartdata = special_user[['date_completed', 'taskPk']].groupby('date_completed').count().reset_index()

# fig = px.line(
#     special_user_chartdata,
#     x="date_completed",
#     y="taskPk",
#     markers=True  # optional: show markers on each date
# )

# fig.update_layout(
#     xaxis_title="Date completed",
#     yaxis_title="Number of tasks",
# )

# fig.show()

In [0]:
# [col for col in user_info_df.columns if 'days' in col]

In [0]:
# user_info_df[user_info_df["wonky_study_count"] > 0]

In [0]:
### days active

In [0]:
# Earliest `date_completed` per respondent, as a two-column frame
# (respondentPk, first_task_completed_date).
first_completion = user_info_df.groupby('respondentPk')['date_completed'].min()
min_dates = first_completion.rename('first_task_completed_date').reset_index()

In [0]:
# Attach each respondent's first completion date to every row of user_info_df.
# Dropping a copy of the column left by an earlier run keeps this cell re-runnable:
# otherwise merge() would emit `_x`/`_y` suffixed columns and the subtraction below
# would raise KeyError. `validate` asserts min_dates has one row per respondent,
# so the merge cannot change the number of rows.
user_info_df = (
    user_info_df
    .drop(columns=["first_task_completed_date"], errors="ignore")
    .merge(min_dates, on="respondentPk", how="left", validate="many_to_one")
)

# Whole days between the respondent's first completion date and this row's date.
user_info_df["days_active_before_task"] = (
    user_info_df["date_completed"] - user_info_df["first_task_completed_date"]
).dt.days

In [0]:
# series = user_info_df['days_active_before_task']

# daysactive_dummies = (
#     series.explode()                    
#      .astype(str).str.strip()                   
#      .pipe(pd.get_dummies)         
#      .groupby(level=0).sum()     
# )

# bins = [2, 7, 14, 21, 28, 31, 50]
# for limit in bins:
#   print(f'days_active created_<{limit}')
#   daysactive_dummies[f'<{limit}'] = daysactive_dummies[[col for col in daysactive_dummies.columns if col.isdigit() and int(col) < limit]].sum(axis=1)

# daysactive_dummies[f'>=_50'] = daysactive_dummies[[col for col in daysactive_dummies.columns if col.isdigit() and int(col) >= 50]].sum(axis=1)

# daysactive_dummies = daysactive_dummies.add_prefix('days_active_')

# daysactive_cols = daysactive_dummies.columns

# user_info_df = user_info_df.join(daysactive_dummies)

In [0]:
# Preview the day offsets computed above (shown as the cell's output).
user_info_df['days_active_before_task']


In [0]:
series = user_info_df['days_active_before_task']

# Exclusive upper limits of the cumulative "< N days" buckets; the last limit
# also opens the ">= N" bucket.
bins = [2, 7, 14, 21, 28, 31, 50]
bin_names = [f'days_active_<{b}' for b in bins] + [f'days_active_>=_{bins[-1]}']

# `days_active_before_task` is produced by `.dt.days`, so every cell is a scalar.
# The previous loop only counted list/ndarray cells, skipped every row and left
# all bucket columns at 0. explode() leaves scalar cells untouched and flattens
# list-like cells (repeating the row label), so both layouts are counted here.
# Missing or non-numeric values become NaN and fall into no bucket.
days = pd.to_numeric(series.explode(), errors='coerce').astype('float64')
day_values = days.to_numpy()[:, None]

with np.errstate(invalid='ignore'):  # comparisons against NaN are simply False
    below_limit = day_values < np.asarray(bins, dtype='float64')
    at_or_above_last = day_values >= bins[-1]

# One 0/1 flag per (value, bucket); buckets are cumulative, not mutually exclusive.
result = np.hstack([below_limit, at_or_above_last]).astype(np.int32)
daysactive_dummies = pd.DataFrame(result, columns=bin_names, index=days.index)

if len(daysactive_dummies) != len(series):
    # List-like cells were exploded into several rows: sum the flags back to one
    # row per original label and restore the original row order.
    daysactive_dummies = (
        daysactive_dummies
        .groupby(level=0).sum()
        .reindex(series.index, fill_value=0)
        .astype(np.int32)
    )

# Drop bucket columns left by an earlier run so re-executing this cell does not
# fail in join() on overlapping column names.
user_info_df = user_info_df.drop(columns=bin_names, errors='ignore').join(daysactive_dummies)

In [0]:
# Cumulative share of rows with at most N days active, for the ten smallest N.
# Sorting the counts by their index (the day value) avoids relying on the column
# names produced by value_counts().reset_index(), which differ between pandas 1.x
# and 2.x (the old 'index' column no longer exists in 2.x).
(
    user_info_df["days_active_before_task"]
    .value_counts()
    .sort_index()
    .cumsum()
    / len(user_info_df)
).head(10)

In [0]:
# Cumulative share of rows covered by the ten most frequent day values
# (value_counts() orders by descending frequency, not by day).
day_value_counts = user_info_df["days_active_before_task"].value_counts()
day_value_counts.cumsum().div(len(user_info_df)).head(10)

In [0]:
# Distribution of `days_active_before_task` (in % of rows) for all rows versus rows
# with wonky_study_count > 0, plus the percentage-point difference between the two.
days_col = "days_active_before_task"
percent_columns = [days_col, "percent"]

# Share of all rows at each day value.
vc_all = user_info_df[days_col].value_counts().sort_index().div(len(user_info_df)).mul(100)
df_all = vc_all.reset_index().set_axis(percent_columns, axis=1).assign(group="All users")

# Same share, computed on the rows with at least one wonky study.
wonky_df = user_info_df[user_info_df["wonky_study_count"] > 0]
vc_wonky = wonky_df[days_col].value_counts().sort_index().div(len(wonky_df)).mul(100)
df_wonky = vc_wonky.reset_index().set_axis(percent_columns, axis=1).assign(group="Wonky users")

plot_df = pd.concat([df_all, df_wonky], ignore_index=True)

# Wide layout (one column per group) so the shares can be subtracted day by day;
# a day value missing from one group counts as 0 %.
pivot_df = (
    plot_df
    .pivot(index=days_col, columns="group", values="percent")
    .fillna(0)
    .assign(delta=lambda wide: wide["Wonky users"] - wide["All users"])
)

# Back to long layout so the delta can be drawn as a third line.
df_delta = (
    pivot_df["delta"]
    .rename("percent")
    .reset_index()
    .assign(group="Delta (Wonky - All)")
)

plot_df_with_delta = pd.concat([plot_df, df_delta], ignore_index=True)

fig = px.line(plot_df_with_delta, x=days_col, y="percent", color="group", markers=True)
fig.update_layout(
    template="plotly_white",
    legend_title_text="Group",
    xaxis_title="Days active before task",
    yaxis_title="Users (%) / Delta (pp)",
)
fig.show()

In [0]:
# All-vs-wonky distribution of days active, restricted to rows with an exposure band.
# NOTE(review): this repeats the logic of the preceding chart cell on a subset;
# a shared helper taking the frame as a parameter would remove the duplication.
days_col = "days_active_before_task"

# Keep only rows with a known exposure band; copy so `df` can be modified
# independently of user_info_df.
df = user_info_df[user_info_df['exposure_band'].notna()].copy()

# Share (in %) of the subset's rows at each day value.
vc_all = df[days_col].value_counts().sort_index().div(len(df)).mul(100)
df_all = vc_all.rename_axis(days_col).reset_index(name="percent")
df_all["group"] = "All users"

# Same share, computed on the subset's rows with at least one wonky study.
wonky_df = df[df["wonky_study_count"] > 0]
vc_wonky = wonky_df[days_col].value_counts().sort_index().div(len(wonky_df)).mul(100)
df_wonky = vc_wonky.rename_axis(days_col).reset_index(name="percent")
df_wonky["group"] = "Wonky users"

plot_df = pd.concat([df_all, df_wonky], ignore_index=True)

# One column per group; day values absent from a group are treated as 0 %.
pivot_df = plot_df.pivot(index=days_col, columns="group", values="percent").fillna(0)
pivot_df["delta"] = pivot_df["Wonky users"].sub(pivot_df["All users"])

# Long-format delta rows so they plot as an additional line.
df_delta = pivot_df["delta"].rename("percent").reset_index()
df_delta["group"] = "Delta (Wonky - All)"

plot_df_with_delta = pd.concat([plot_df, df_delta], ignore_index=True)

fig = px.line(
    plot_df_with_delta,
    x=days_col,
    y="percent",
    color="group",
    markers=True,
)
fig.update_layout(
    template="plotly_white",
    legend_title_text="Group",
    xaxis_title="Days active before task",
    yaxis_title="Users (%) / Delta (pp)",
)
fig.show()

In [0]:
# Cumulative share of the rows in `df` with at most N days active (first 40 day values).
# Sorting the counts by their index (the day value) avoids relying on the column
# names produced by value_counts().reset_index(), which differ between pandas 1.x
# and 2.x (the old 'index' column no longer exists in 2.x).
(
    df["days_active_before_task"]
    .value_counts()
    .sort_index()
    .cumsum()
    / len(df)
).head(40)

In [0]:
# Distribution of days active (in % of each group's rows), one line per
# (user type, exposure band) combination.
days_col = "days_active_before_task"

# Label each row by whether it has at least one wonky study.
# NOTE(review): 'All' marks the rows that are NOT wonky rather than the whole
# population — confirm the label is intended before renaming it.
df['user_type'] = np.where(df['wonky_study_count'] > 0, 'Wonky', 'All')

plot_data = []
grouped = df.groupby(['user_type', 'exposure_band'])

for (user_type, exposure), group_df in grouped:
    # Share of this group's rows at each day value, ordered by day.
    vc = group_df[days_col].value_counts().sort_index().div(len(group_df)).mul(100)
    df_temp = (
        vc.reset_index()
        .set_axis([days_col, "percent"], axis=1)
        .assign(group=f"{user_type} - {exposure.capitalize()}")
    )
    plot_data.append(df_temp)

# Stack the per-group frames once, after the loop.
plot_df = pd.concat(plot_data, ignore_index=True)

fig = px.line(plot_df, x=days_col, y="percent", color="group", markers=True)
fig.update_layout(
    template="plotly_white",
    legend_title_text="Group",
    xaxis_title="Days active before task",
    yaxis_title="Users (%)",
)
fig.show()