In [0]:
# Import libraries
import sys
import os

sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import yaml

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_task_amount_features,
    create_task_temporal_features,
    create_all_temporal_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score,
    add_rating_delta
)

from eda.visualizations import (
    create_breakdown_summary,
    create_breakdown_chart,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_feature_breakdown_table,
    calculate_temporal_feature_deltas,       
)

# Load the data-path configuration that later cells read their input/output
# file names from. The encoding is pinned to UTF-8 so the YAML parses the same
# way regardless of the platform's default locale encoding.
with open('../configs/data_paths.yaml', 'r', encoding='utf-8') as f:
    paths_config = yaml.safe_load(f)

# Never truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### Load

In [0]:
# Resolve the repository layout from the notebook's working directory: the
# repo root is its parent, and every input file lives in <repo_root>/misc.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")


def _misc_file(config_key):
    """Return the path in ``misc_dir`` for the file configured under ``output_files[config_key]``.

    Only the base name of the configured path is kept; its directory part is
    discarded and replaced by ``misc_dir``.
    """
    configured_path = paths_config['output_files'][config_key]
    return os.path.join(misc_dir, os.path.basename(configured_path))


output_path = _misc_file('user_info_df')
wonky_counts_path = _misc_file('wonky_user_counts')
wonky_respondent_df_path = _misc_file('wonky_respondent_df')
wonky_respondent_summary_path = _misc_file('wonky_respondent_summary')
wonky_map_path = _misc_file('wonky_map')

# Read each parquet file into its own DataFrame.
user_info_df = pd.read_parquet(output_path)
wonky_counts = pd.read_parquet(wonky_counts_path)
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)
wonky_map = pd.read_parquet(wonky_map_path)

In [0]:
# Display the wonky_map table that was read from parquet.
wonky_map

In [0]:
# (rows, columns) of user_info_df as loaded.
user_info_df.shape

In [0]:
# Percentage of rows whose wonky_study_count is exactly 1.
(int((user_info_df['wonky_study_count'] == 1).sum()) / len(user_info_df)) * 100

In [0]:
# Number of rows whose wonky_study_count is exactly 1.
int((user_info_df['wonky_study_count'] == 1).sum())

In [0]:
# Keep only rows that have an exposure_band value. The original index labels
# are retained (not renumbered) by this filter.
user_info_df = user_info_df.loc[user_info_df['exposure_band'].notna()]

In [0]:
# (rows, columns) after dropping rows without an exposure_band.
user_info_df.shape

In [0]:
# Earliest date_completed among rows that have an exposure_band.
user_info_df.loc[user_info_df['exposure_band'].notna(), 'date_completed'].min()

In [0]:
# Re-check (rows, columns); the preceding cell only reads the frame.
user_info_df.shape

#### task labels

In [0]:
# Alphabetical list of the columns currently in user_info_df.
sorted(user_info_df.columns)

In [0]:
# Add task-amount features via the project helper, which returns the updated
# frame together with `task_labels`.
# NOTE(review): 'totaal_tasks_completed' is passed exactly as spelled here;
# presumably it matches the column name in the data -- confirm it is not a typo.
user_info_df, task_labels = create_task_amount_features(user_info_df, 'totaal_tasks_completed')

In [0]:
# Display the frame after the task-amount features were added.
user_info_df

#### days active - grouped

In [0]:
# Earliest date_completed per respondent, as a two-column frame:
# respondentPk, first_task_completed_date.
min_dates = (
    user_info_df
    .groupby('respondentPk')['date_completed']
    .min()
    .rename('first_task_completed_date')
    .reset_index()
)

In [0]:
# Attach each respondent's first completion date to every one of their rows.
# The merge also renumbers the row index as 0..n-1.
user_info_df = user_info_df.merge(min_dates, how="left", on="respondentPk")

# Whole days elapsed between the respondent's first completed task and this one.
elapsed_since_first = user_info_df["date_completed"] - user_info_df["first_task_completed_date"]
user_info_df["days_active_before_task"] = elapsed_since_first.dt.days

In [0]:
# Build two families of indicator columns from `days_active_before_task`:
#   * one column per distinct day count     -> days_active_<n>
#   * cumulative buckets over the day count -> days_active_<2 ... days_active_<50,
#                                              days_active_>=_50
series = user_info_df['days_active_before_task']

# Label every row with its day count as an integer string. `.dt.days` yields
# floats as soon as a single value is missing; stringifying those produced
# labels such as '3.0', which the former `col.isdigit()` filter rejected, so
# every bucket silently summed to zero. Missing day counts stay unlabelled and
# therefore get no indicator column.
day_labels = series.map(lambda days: str(int(days)), na_action='ignore')

daysactive_dummies = (
    day_labels
     .pipe(pd.get_dummies)
     .groupby(level=0).sum()
)

# Buckets are derived from the numeric values directly instead of by parsing
# column names. They are cumulative: a task at day 1 is flagged in every
# "<limit" column. Rows with a missing day count get 0 in every bucket.
bins = [2, 7, 14, 21, 28, 31, 50]
for limit in bins:
    print(f'days_active created_<{limit}')
    daysactive_dummies[f'<{limit}'] = (series < limit).astype(int).groupby(level=0).sum()

daysactive_dummies['>=_50'] = (series >= 50).astype(int).groupby(level=0).sum()

daysactive_dummies = daysactive_dummies.add_prefix('days_active_')

# Names of all generated columns, kept for later reference.
daysactive_cols = daysactive_dummies.columns

# Index-aligned join back onto the main frame.
user_info_df = user_info_df.join(daysactive_dummies)

In [0]:
# Sorted names of the generated days_active_* columns.
sorted(daysactive_dummies.columns)

In [0]:
# (rows, columns) after joining the days_active_* columns.
user_info_df.shape

#### Entry Point

In [0]:
# One-hot encode `entryPoint` into entryPoint_<value> columns.
series = user_info_df['entryPoint']

# explode() spreads list-like cells over several rows (scalar cells pass
# through unchanged); values are whitespace-trimmed before encoding.
EP_dummies = pd.get_dummies(series.explode().str.strip())
# Collapse back to one row per original index label, then namespace the columns.
EP_dummies = EP_dummies.groupby(level=0).sum().add_prefix('entryPoint_')

EP_cols = EP_dummies.columns
user_info_df = user_info_df.join(EP_dummies)

In [0]:
# Names of the generated entryPoint_* columns.
EP_dummies.columns

In [0]:
# (rows, columns) after joining the entryPoint_* columns.
user_info_df.shape

#### Email verification

In [0]:
# One-hot encode `email_verified` into email_verified_<value> columns.
series = user_info_df['email_verified']

# Values are stringified before encoding, so missing values are stringified
# too (e.g. NaN -> 'nan') and end up with their own indicator column.
email_veri_dummies = pd.get_dummies(series.explode().astype(str).str.strip())
# One row per original index label, columns namespaced with the feature name.
email_veri_dummies = email_veri_dummies.groupby(level=0).sum().add_prefix('email_verified_')

email_veri_cols = email_veri_dummies.columns
user_info_df = user_info_df.join(email_veri_dummies)

In [0]:
# Names of the generated email_verified_* columns.
email_veri_dummies.columns

In [0]:
# (rows, columns) after joining the email_verified_* columns.
user_info_df.shape

#### notify task pay out

In [0]:
# One-hot encode `notify_task_payout` into notify_task_payout_<value> columns.
# The column is cast to string up front, so every cell (including missing
# ones) is encoded as text.
series = user_info_df['notify_task_payout'].astype(str)

notify_task_payout_dummies = pd.get_dummies(series.explode().str.strip())
# One row per original index label, columns namespaced with the feature name.
notify_task_payout_dummies = (
    notify_task_payout_dummies.groupby(level=0).sum().add_prefix('notify_task_payout_')
)

notify_task_payout_cols = notify_task_payout_dummies.columns
user_info_df = user_info_df.join(notify_task_payout_dummies)

In [0]:
# Names of the generated notify_task_payout_* columns.
notify_task_payout_dummies.columns

In [0]:
# (rows, columns) after joining the notify_task_payout_* columns.
user_info_df.shape

#### notify new task

In [0]:
# One-hot encode `notify_new_task` into notify_new_task_<value> columns.
series = user_info_df['notify_new_task']

# Stringify after exploding so each individual value becomes a text category;
# missing values are stringified as well and get their own column.
notify_new_task_dummies = pd.get_dummies(series.explode().astype(str).str.strip())
# One row per original index label, columns namespaced with the feature name.
notify_new_task_dummies = (
    notify_new_task_dummies.groupby(level=0).sum().add_prefix('notify_new_task_')
)

notify_new_task_cols = notify_new_task_dummies.columns
user_info_df = user_info_df.join(notify_new_task_dummies)

In [0]:
# Names of the generated notify_new_task_* columns.
notify_new_task_dummies.columns

In [0]:
# (rows, columns) after joining the notify_new_task_* columns.
user_info_df.shape

#### share location data

In [0]:

# One-hot encode `share_location_data` into share_location_data_<value> columns.
series = user_info_df['share_location_data']

# Each value is stringified and trimmed before encoding; missing values are
# stringified too and so receive their own indicator column.
share_location_data_dummies = pd.get_dummies(series.explode().astype(str).str.strip())
# One row per original index label, columns namespaced with the feature name.
share_location_data_dummies = (
    share_location_data_dummies.groupby(level=0).sum().add_prefix('share_location_data_')
)

share_location_data_cols = share_location_data_dummies.columns
user_info_df = user_info_df.join(share_location_data_dummies)

In [0]:
# Names of the generated share_location_data_* columns.
share_location_data_dummies.columns

In [0]:
# (rows, columns) after joining the share_location_data_* columns.
user_info_df.shape

#### Temporal features

In [0]:
# Create time features using modular function
user_info_df = create_task_temporal_features(user_info_df, date_col="date_completed")

print(f"Night tasks: {user_info_df['is_night'].mean()*100:.1f}%")
print(f"Weekend tasks: {user_info_df['is_weekend'].mean()*100:.1f}%")

In [0]:
# Build the full temporal feature set (hourly features included) into a
# separate frame for inspection.
# NOTE(review): the result is kept in `df_full` only; `user_info_df` is not
# updated by this cell.
df_full = create_all_temporal_features(user_info_df, date_col="date_completed", include_hourly=True)

# A column counts as temporal if it is an `is_*` flag or one of the named
# hour/day columns.
temporal_features = [
    col for col in df_full.columns
    if col.startswith('is_')
    or col in ('hour_of_day', 'day_of_week', 'hour_period', 'hour_of_day_label')
]

print(f"Created {len(temporal_features)} temporal features:")
for feature in sorted(temporal_features):
    print(f"  - {feature}")

rule = "="*70

# Row counts per hour label, ordered by label.
print("\n" + rule)
print("Task Completion Distribution by Hour:")
print(rule)
hour_dist = df_full['hour_of_day_label'].value_counts().sort_index()
print(hour_dist)

# First ten rows of the hour-related columns next to the source timestamp.
print("\n" + rule)
print("Sample Data with Hour Labels:")
print(rule)
print(df_full[['date_completed', 'hour_of_day', 'hour_of_day_label', 'hour_period']].head(10))


In [0]:
# (rows, columns) after the temporal features were added.
user_info_df.shape

#### Task speeds

In [0]:
# Cap extreme task durations at the 99.99th percentile of `task_time_taken_s`.
# The percentile is computed once, and `clip` leaves missing durations missing;
# the former `np.where(x < cap, x, cap)` form evaluated NaN < cap as False and
# therefore replaced every missing duration with the cap value.
task_time_cap = user_info_df['task_time_taken_s'].quantile(0.9999)
user_info_df['task_time_taken_s_capped'] = user_info_df['task_time_taken_s'].clip(upper=task_time_cap)

In [0]:
# Add task-speed features via the project helper.
# NOTE(review): this reads the raw `task_time_taken_s` column, not the
# `task_time_taken_s_capped` column created in the preceding cell -- confirm
# which one is intended.
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    use_std_dev=True
)

In [0]:
# (rows, columns) after the task-speed features were added.
user_info_df.shape

#### Devices

In [0]:
# One-hot encode `hardware_version`; one indicator column per distinct value.
# NOTE(review): no prefix is added, so the new column names are the raw
# hardware_version values; the join below raises if any of them matches an
# existing column name -- confirm there are no collisions.
series = user_info_df['hardware_version']

hardware_dummies = pd.get_dummies(series.explode().str.strip())
# Collapse back to one row per original index label.
hardware_dummies = hardware_dummies.groupby(level=0).sum()

hardware_cols = hardware_dummies.columns
user_info_df = user_info_df.join(hardware_dummies)

In [0]:
# Names of the generated hardware indicator columns.
hardware_dummies.columns

In [0]:
# (rows, columns) after joining the hardware indicator columns.
user_info_df.shape

#### Platform

In [0]:
# One-hot encode `platform_name`; one indicator column per distinct value.
# NOTE(review): columns are joined without a prefix, so their names are the
# raw platform_name values -- confirm they cannot clash with existing columns.
series = user_info_df['platform_name']

platform_dummies = pd.get_dummies(series.explode().str.strip())
# Collapse back to one row per original index label.
platform_dummies = platform_dummies.groupby(level=0).sum()

platform_cols = platform_dummies.columns
user_info_df = user_info_df.join(platform_dummies)

In [0]:
# Names of the generated platform indicator columns.
platform_dummies.columns

In [0]:
# (rows, columns) after joining the platform indicator columns.
user_info_df.shape

#### Gambling

In [0]:
# One-hot encode each gambling mode found in `gambling_participation_mc`.
# Presumably the cells hold lists of modes (multi-choice) -- TODO confirm;
# explode() gives each mode its own row before encoding.
series = user_info_df['gambling_participation_mc']

gambling_dummies = pd.get_dummies(series.explode().str.strip())
# Sum the per-mode rows back into one row per original index label.
gambling_dummies = gambling_dummies.groupby(level=0).sum()

# No prefix is added: column names are the raw mode values.
gambling_cols = gambling_dummies.columns
user_info_df = user_info_df.join(gambling_dummies)

In [0]:
# Names of the generated gambling-mode indicator columns.
gambling_dummies.columns

In [0]:
# (rows, columns) after joining the gambling-mode indicator columns.
user_info_df.shape

#### Income

In [0]:
# Human-readable household-income bands, in code order: the first label
# belongs to code "A", the last to code "R".
income_band_labels = [
    "Less than £15,000",
    "£15,000 to £19,999",
    "£20,000 to £24,999",
    "£25,000 to £29,999",
    "£30,000 to £34,999",
    "£35,000 to £39,999",
    "£40,000 to £44,999",
    "£45,000 to £49,999",
    "£50,000 to £59,999",
    "£60,000 to £74,999",
    "£75,000 to £84,999",
    "£85,000 to £99,999",
    "£100,000 to £124,999",
    "£125,000 to £149,999",
    "£150,000 to £174,999",
    "£175,000 to £199,999",
    "£200,000 and above",
    "Prefer not to answer",
]
income_map = dict(zip("ABCDEFGHIJKLMNOPQR", income_band_labels))

# Translate the letter codes; codes missing from the map become NaN.
income_codes = user_info_df["fulcrum_household_income"]
user_info_df["fulcrum_household_income_mapped"] = income_codes.map(income_map)

# Share of ALL rows per band (the denominator includes unmapped rows).
user_info_df["fulcrum_household_income_mapped"].value_counts().div(len(user_info_df))

In [0]:
# One-hot encode the mapped household-income band; one indicator column per
# band label. Rows without a mapped band get 0 in every column.
series = user_info_df['fulcrum_household_income_mapped']

income_dummies = pd.get_dummies(series.explode().str.strip())
# Collapse back to one row per original index label.
income_dummies = income_dummies.groupby(level=0).sum()

# No prefix is added: column names are the band labels themselves.
income_cols = income_dummies.columns
user_info_df = user_info_df.join(income_dummies)

In [0]:
# Sorted names of the generated income-band indicator columns.
sorted(income_dummies.columns)

In [0]:
# (rows, columns) after joining the income-band indicator columns.
user_info_df.shape

#### Risk Vars

In [0]:
# One-hot encode the `risk` score into risk_<value> columns. The score is cast
# to string first, so every distinct value (missing ones included) becomes its
# own category.
series = user_info_df['risk'].astype(str)

risk_dummies = pd.get_dummies(series.explode().str.strip())
# One row per original index label, columns namespaced with the feature name.
risk_dummies = risk_dummies.groupby(level=0).sum().add_prefix('risk_')

risk_cols = risk_dummies.columns
user_info_df = user_info_df.join(risk_dummies)

In [0]:
# Names of the generated risk_* columns.
risk_dummies.columns

In [0]:
# 0/1 flags on the numeric `risk` score: one for an exact score of 100, then
# cumulative "below cutoff" flags (a score of 40 sets all three).
user_info_df['risk=100'] = np.where(user_info_df['risk'] == 100, 1, 0)
for cutoff in (90, 80, 50):
    user_info_df[f'risk<{cutoff}'] = np.where(user_info_df['risk'] < cutoff, 1, 0)

In [0]:
# Add a `risk_delta` column via the project helper (delta period: 'max').
user_info_df = add_rating_delta(df=user_info_df, feature='risk', delta_period='max')

print(f'new array of unique deltas created: {user_info_df["risk_delta"].unique()}')

# 0/1 flags describing the sign and size of the delta. The categories overlap:
# a delta above 50 sets both the "Large" and the plain positive flag, and
# likewise for negatives. Column names (including the "Postive" spelling) are
# kept exactly as they were, since other code may refer to them.
risk_delta = user_info_df['risk_delta']
risk_delta_conditions = {
    'risk_delta_LargePostive': risk_delta > 50,
    'risk_delta_Postive': risk_delta > 0,
    'risk_delta_Neutral': risk_delta == 0,
    'risk_delta_LargeNegative': risk_delta < -50,
    'risk_delta_Negative': risk_delta < 0,
}
for flag_name, condition in risk_delta_conditions.items():
    user_info_df[flag_name] = np.where(condition, 1, 0)

In [0]:
# (rows, columns) after the risk features were added.
user_info_df.shape

#### Gender

In [0]:
# One-hot encode `gender` into gender_<value> columns.
series = user_info_df['gender']

# Values are whitespace-trimmed before encoding; rows with a missing gender
# get 0 in every indicator column.
gender_dummies = pd.get_dummies(series.explode().str.strip())
# One row per original index label, columns namespaced with the feature name.
gender_dummies = gender_dummies.groupby(level=0).sum().add_prefix('gender_')

gender_cols = gender_dummies.columns
user_info_df = user_info_df.join(gender_dummies)

In [0]:
# Names of the generated gender_* columns.
gender_dummies.columns

In [0]:
# (rows, columns) after joining the gender_* columns.
user_info_df.shape

#### Exposure Bands

In [0]:
# Alphabetical list of all columns accumulated so far.
sorted(user_info_df.columns)

In [0]:
# One-hot encode `exposure_band` into exposure_<band> columns.
series = user_info_df['exposure_band']

# Band labels are whitespace-trimmed before encoding.
exposure_dummies = pd.get_dummies(series.explode().str.strip())
# One row per original index label, columns namespaced with 'exposure_'.
exposure_dummies = exposure_dummies.groupby(level=0).sum().add_prefix('exposure_')

exposure_cols = exposure_dummies.columns
user_info_df = user_info_df.join(exposure_dummies)

In [0]:
# Names of the generated exposure_* columns.
exposure_dummies.columns

In [0]:
# (rows, columns) after joining the exposure_* columns.
user_info_df.shape

#### Quality Vars

In [0]:
# One-hot encode the `quality` score into quality_<value> columns.
series = user_info_df['quality']

# Each score is stringified before encoding, so every distinct value (missing
# ones included) becomes its own category.
quality_dummies = pd.get_dummies(series.explode().astype(str).str.strip())
# One row per original index label, columns namespaced with the feature name.
quality_dummies = quality_dummies.groupby(level=0).sum().add_prefix('quality_')

quality_cols = quality_dummies.columns
user_info_df = user_info_df.join(quality_dummies)

In [0]:
# Names of the generated quality_* columns.
quality_dummies.columns

In [0]:
# 0/1 flags on the numeric `quality` score: one for an exact score of 100,
# then cumulative "below cutoff" flags (a score of 20 sets all four).
user_info_df["quality=100"] = np.where(user_info_df["quality"] == 100, 1, 0)
for cutoff in (90, 75, 50, 30):
    user_info_df[f"quality<{cutoff}"] = np.where(user_info_df["quality"] < cutoff, 1, 0)

In [0]:
# Add a `quality_delta` column via the project helper (delta period: 1).
user_info_df = add_rating_delta(df=user_info_df, feature='quality', delta_period=1)

print(f'new array of unique deltas created: {user_info_df["quality_delta"].unique()}')

# 0/1 flags describing the sign and size of the delta. The categories overlap:
# a delta above 50 sets both the "Large" and the plain positive flag, and
# likewise for negatives. Column names (including the "Postive" spelling) are
# kept exactly as they were, since other code may refer to them.
quality_delta = user_info_df['quality_delta']
quality_delta_conditions = {
    'quality_delta_LargePostive': quality_delta > 50,
    'quality_delta_Postive': quality_delta > 0,
    'quality_delta_Neutral': quality_delta == 0,
    'quality_delta_LargeNegative': quality_delta < -50,
    'quality_delta_Negative': quality_delta < 0,
}
for flag_name, condition in quality_delta_conditions.items():
    user_info_df[flag_name] = np.where(condition, 1, 0)

In [0]:
# (rows, columns) after the quality features were added.
user_info_df.shape

#### Multiple YOBs

In [0]:
# 97.5% of the balance table since September has the same YOB, so the YOB-count feature below is left disabled.

# user_info_df = user_info_df.merge(
#     pd.DataFrame(
#         user_info_df[["respondentPk", "YOB"]]
#         .groupby("respondentPk")
#         .count()
#     ).rename(columns={"YOB": "YOB_count"}),
#     on="respondentPk",
#     how="left",
# )

In [0]:
# YOB one-hot encoding is a non-starter; the code below is left disabled.

# series = user_info_df['YOB_count']

# # One-hot encode each gambling mode
# YOB_dummies = (
#     series.explode()                    
#      .str.strip()                   
#      .pipe(pd.get_dummies)          
#      .groupby(level=0).sum()     
# )

# YOB_dummies = YOB_dummies.add_prefix('exposure_')

# YOB_cols = YOB_dummies.columns

# user_info_df = user_info_df.join(YOB_dummies)

#### IP Addresses

In [0]:
# Count unique IP addresses per respondentPk and merge back
ip_counts = user_info_df.groupby('respondentPk')['request-remote-addr'].nunique().reset_index()
ip_counts.columns = ['respondentPk', 'ip_address_count']

# Merge back to main dataframe
user_info_df = user_info_df.merge(ip_counts, on='respondentPk', how='left')

# Summary stats
print(f"Total respondents: {user_info_df['respondentPk'].nunique():,}")
print(f"Respondents with 1 IP: {(user_info_df.groupby('respondentPk')['ip_address_count'].first() == 1).sum():,}")
print(f"Respondents with 2+ IPs: {(user_info_df.groupby('respondentPk')['ip_address_count'].first() > 1).sum():,}")

In [0]:
# One-hot encode the per-respondent IP-address count into
# ip_addr_count_<n> columns (one column per distinct count).
series = user_info_df['ip_address_count']

# Counts are stringified so each distinct count becomes a category label.
ip_dummies = pd.get_dummies(series.explode().astype(str).str.strip())
# One row per original index label, columns namespaced with 'ip_addr_count_'.
ip_dummies = ip_dummies.groupby(level=0).sum().add_prefix('ip_addr_count_')

ip_cols = ip_dummies.columns
user_info_df = user_info_df.join(ip_dummies)

In [0]:
# Display the fully feature-enriched frame before it is written out.
user_info_df

In [0]:
# Persist the feature-enriched frame into `misc_dir`, using the base name of
# the path configured under output_files['user_info_df_post_eda'].
# The key is indexed directly, like every other `output_files` lookup in this
# notebook, so a missing entry fails with a KeyError that names the key; with
# `.get()` it returned None and `os.path.basename(None)` raised an unrelated
# TypeError.
user_info_POSTEDA_path = os.path.join(misc_dir,
                          os.path.basename(paths_config['output_files']['user_info_df_post_eda']))

# index=False: the row index is not written to the parquet file.
user_info_df.to_parquet(user_info_POSTEDA_path, index=False)