In [0]:
# Import libraries
import sys
import os

sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import yaml

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_task_amount_features,
    create_all_temporal_features,
    create_task_speed_features,
    )

from eda.visualizations import (
    create_breakdown_summary,
    create_breakdown_chart,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_feature_breakdown_table,
    calculate_temporal_feature_deltas,       
)

# Load the data-path configuration that later cells read file names from.
# The path is relative to the notebook's working directory.
# The encoding is stated explicitly so the file is decoded the same way on
# every platform instead of with the locale's default encoding.
with open('../configs/data_paths.yaml', 'r', encoding='utf-8') as f:
    paths_config = yaml.safe_load(f)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### Load

In [0]:
# The repository root is taken to be the parent of the current working
# directory; all inputs are read from its misc/ folder.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")


def _misc_file(config_key):
    """Return the misc/ path for the file configured under ``output_files[config_key]``.

    Only the base name of the configured path is kept; its directory part is
    replaced by ``misc_dir``.
    """
    configured_path = paths_config['output_files'][config_key]
    return os.path.join(misc_dir, os.path.basename(configured_path))


output_path = _misc_file('user_info_df')
wonky_counts_path = _misc_file('wonky_user_counts')
wonky_respondent_df_path = _misc_file('wonky_respondent_df')
wonky_respondent_summary_path = _misc_file('wonky_respondent_summary')
wonky_map_path = _misc_file('wonky_map')

# Read each parquet file into its own DataFrame.
user_info_df = pd.read_parquet(output_path)
wonky_counts = pd.read_parquet(wonky_counts_path)
wonky_respondent_df = pd.read_parquet(wonky_respondent_df_path)
wonky_respondent_summary = pd.read_parquet(wonky_respondent_summary_path)
wonky_map = pd.read_parquet(wonky_map_path)

In [0]:
# Display the loaded wonky_map frame for a visual check.
wonky_map

In [0]:
# (rows, columns) of user_info_df as loaded, before any filtering.
user_info_df.shape

In [0]:
# Percentage of rows whose wonky_study_count equals exactly 1.
is_single_wonky = user_info_df['wonky_study_count'] == 1
int(is_single_wonky.sum()) / len(user_info_df) * 100

In [0]:
# Number of rows whose wonky_study_count equals exactly 1.
int(user_info_df['wonky_study_count'].eq(1).sum())

In [0]:
# Keep only the rows that have an exposure_band value.
has_exposure_band = user_info_df['exposure_band'].notna()
user_info_df = user_info_df[has_exposure_band]

In [0]:
# (rows, columns) after dropping rows without an exposure_band.
user_info_df.shape

In [0]:
# Earliest date_completed among rows that have an exposure_band value.
user_info_df.loc[user_info_df['exposure_band'].notna(), 'date_completed'].min()

In [0]:
# (rows, columns) of user_info_df at this point in the notebook.
user_info_df.shape

In [0]:
# Alphabetical list of the column names currently in user_info_df.
sorted(user_info_df.columns.tolist())

In [0]:
# Cap task times at the 99.99th percentile. `clip` leaves missing values as
# NaN; the earlier `np.where(x < cap, x, cap)` form replaced NaN with the cap,
# because any comparison against NaN evaluates to False.
task_time_cap = user_info_df['task_time_taken_s'].quantile(0.9999)
user_info_df['task_time_taken_s_capped'] = user_info_df['task_time_taken_s'].clip(upper=task_time_cap)

# NOTE(review): the uncapped column is what is passed as `task_time_col`, so
# `task_time_taken_s_capped` is not used by this call — confirm that is intended.
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    use_std_dev=True
)

In [0]:
# Add the features produced by create_all_temporal_features, based on the
# completion timestamp column.
user_info_df = create_all_temporal_features(
    user_info_df,
    date_col="date_completed",
    include_hourly=True,
)

# A column counts as a temporal feature if its name starts with 'is_' or is
# one of the four named columns below. Note that the 'is_' prefix test matches
# every such column in the frame, whichever step created it.
_named_temporal_cols = {'hour_of_day', 'day_of_week', 'hour_period', 'hour_of_day_label'}
temporal_features = [
    col for col in user_info_df.columns
    if col.startswith('is_') or col in _named_temporal_cols
]


def _print_banner(title):
    """Print a blank line, then ``title`` framed by two 70-character rules."""
    rule = "=" * 70
    print("\n" + rule)
    print(title)
    print(rule)


print(f"Created {len(temporal_features)} temporal features:")
for feature in sorted(temporal_features):
    print(f"  - {feature}")

# Number of rows per hour label, ordered by label.
_print_banner("Task Completion Distribution by Hour:")
hour_dist = user_info_df['hour_of_day_label'].value_counts().sort_index()
print(hour_dist)

# First ten rows of the timestamp next to its derived hour columns.
_print_banner("Sample Data with Hour Labels:")
_hour_sample_cols = ['date_completed', 'hour_of_day', 'hour_of_day_label', 'hour_period']
print(user_info_df[_hour_sample_cols].head(10))


#### batch one-hot encoding

#### Rating thresholds (quality and risk)

In [0]:
from feature_engineering_utils import create_threshold_features, create_score_features

# Threshold specs for the 'quality' column; each entry is presumably a
# (new feature name, comparison operator, cut-off) triple — verify against the helper.
quality_thresholds = [('quality=100', '==', 100)]
quality_thresholds += [
    (f'quality<{cutoff}', '<', cutoff) for cutoff in (90, 75, 50, 30)
]

user_info_df = create_threshold_features(
    user_info_df, 'quality', thresholds=quality_thresholds
)

# Score features: 'quality' uses the helper's default thresholds, 'risk' an explicit list.
user_info_df = create_score_features(user_info_df, 'quality')
user_info_df = create_score_features(user_info_df, 'risk', thresholds=[90, 80, 50])

#### binary days active


In [0]:
# One row per respondent holding the earliest date_completed seen for them.
min_dates = (
    user_info_df.groupby('respondentPk')['date_completed']
    .min()
    .rename('first_task_completed_date')
    .reset_index()
)

In [0]:
# Attach each respondent's first completion date to every one of their rows.
# A column left over from an earlier run of this cell is dropped first, so
# re-running does not create `_x`/`_y` suffixed duplicates (which would make
# the subtraction below fail with a KeyError).
# `validate` asserts that min_dates has at most one row per respondent, so the
# merge cannot multiply rows.
user_info_df = (
    user_info_df
    .drop(columns=["first_task_completed_date"], errors="ignore")
    .merge(min_dates, on="respondentPk", how="left", validate="many_to_one")
)

# Days component of the gap between this row's completion date and the
# respondent's first completion date.
user_info_df["days_active_before_task"] = (
    user_info_df["date_completed"] - user_info_df["first_task_completed_date"]
).dt.days

In [0]:
from feature_engineering_utils import create_binned_features

# Values handed to the helper as `bins` for the days-active column.
days_active_bins = [2, 10, 20, 30, 50, 75, 100, 125, 150, 200, 250]

# The helper returns the updated frame together with a second value, kept
# here as days_active_cols.
user_info_df, days_active_cols = create_binned_features(
    user_info_df,
    'days_active_before_task',
    bins=days_active_bins,
    prefix='days_active_',
)

#### mapping

In [0]:
from feature_engineering_utils import create_value_mapping_and_encode

In [0]:
# Household-income answer codes "A".."R" mapped to their display labels.
# Codes and labels are paired by position, so the order of the labels matters.
_income_band_labels = (
    "Less than £15,000",
    "£15,000 to £19,999",
    "£20,000 to £24,999",
    "£25,000 to £29,999",
    "£30,000 to £34,999",
    "£35,000 to £39,999",
    "£40,000 to £44,999",
    "£45,000 to £49,999",
    "£50,000 to £59,999",
    "£60,000 to £74,999",
    "£75,000 to £84,999",
    "£85,000 to £99,999",
    "£100,000 to £124,999",
    "£125,000 to £149,999",
    "£150,000 to £174,999",
    "£175,000 to £199,999",
    "£200,000 and above",
    "Prefer not to answer",
)
income_map = dict(zip("ABCDEFGHIJKLMNOPQR", _income_band_labels))

# Pass the 'fulcrum_household_income' column and the code-to-label mapping to
# the helper. It returns the updated frame plus a second value kept as
# income_cols (presumably the names of the columns it created — TODO confirm).
user_info_df, income_cols = create_value_mapping_and_encode(
    user_info_df, 
    'fulcrum_household_income',
    income_map=income_map
)

#### task labels

In [0]:
# Build the task-amount features from the completed-task count column.
# NOTE(review): the column name is spelled 'totaal_tasks_completed' — confirm
# this matches the actual column in the data and is not a typo for 'total_...'.
user_info_df, task_labels = create_task_amount_features(user_info_df, 'totaal_tasks_completed')

#### IP addresses

In [0]:
# Count unique IP addresses per respondentPk and merge back
ip_counts = user_info_df.groupby('respondentPk')['request-remote-addr'].nunique().reset_index()
ip_counts.columns = ['respondentPk', 'ip_address_count']

# Merge back to main dataframe
user_info_df = user_info_df.merge(ip_counts, on='respondentPk', how='left')

# Summary stats
print(f"Total respondents: {user_info_df['respondentPk'].nunique():,}")
print(f"Respondents with 1 IP: {(user_info_df.groupby('respondentPk')['ip_address_count'].first() == 1).sum():,}")
print(f"Respondents with 2+ IPs: {(user_info_df.groupby('respondentPk')['ip_address_count'].first() > 1).sum():,}")

#### encoding

In [0]:
from feature_engineering_utils import one_hot_encode_column, batch_one_hot_encode

# Columns passed to batch_one_hot_encode as `columns`.
str_vars_to_encode = [
    "notify_task_payout",
    "exposure_band",
    "gender",
    "title",
    "entryPoint",
    "hardware_version",
    "gambling_participation_mc",
]
# Columns passed as `as_string_columns` (presumably cast to string before
# encoding — TODO confirm against the helper).
# NOTE(review): "notify_task_payout" appears in both lists — confirm which
# list it belongs in, or that the helper tolerates the overlap.
nonstr_vars_to_encode = [
    "quality",
    "risk",
    "task_length_of_task",
    "notify_task_payout",
    "share_location_data",
    "notify_new_task",
    "email_verified",
    "ip_address_count",
]

# NOTE(review): whether the `as_string_columns` entries are encoded even though
# they are not all listed in `columns` depends on the helper — verify.
user_info_df = batch_one_hot_encode(
    user_info_df,
    columns=str_vars_to_encode,
    as_string_columns=nonstr_vars_to_encode
)

In [0]:
# Write the feature-engineered frame into misc/ under the configured file name.
# The config is indexed directly, as for the input paths in the load cell, so
# a missing 'user_info_df_post_eda' entry raises a clear KeyError instead of
# an opaque TypeError from os.path.basename(None).
user_info_POSTEDA_path = os.path.join(
    misc_dir,
    os.path.basename(paths_config['output_files']['user_info_df_post_eda']),
)

user_info_df.to_parquet(user_info_POSTEDA_path, index=False)