In [0]:
# Import libraries
import sys
import os

# Make <parent of the working directory>/src importable for the `eda` imports below.
src_dir = os.path.join(os.path.dirname(os.getcwd()), 'src')
sys.path.append(src_dir)

import numpy as np
import pandas as pd
import yaml

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_task_amount_features,
    create_all_temporal_features,
    create_task_speed_features,
    )

from eda.feature_engineering_utils import (
    create_binned_features,
    create_value_mapping_and_encode,
    one_hot_encode_column,
    batch_one_hot_encode,
    create_threshold_features,
    create_score_features,
    filter_to_engineered_features,
    add_device_categories,        # NEW
    get_device_mapping_summary,   # NEW (optional, for validation)
)

from eda.visualizations import (
    create_breakdown_summary,
    create_breakdown_chart,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_feature_breakdown_table,
    calculate_temporal_feature_deltas,       
)

# Read the data-path configuration; the path is relative to the working directory.
config_file_path = '../configs/data_paths.yaml'
with open(config_file_path, 'r') as config_file:
    paths_config = yaml.safe_load(config_file)

# Do not truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


### Load

In [0]:
# Resolve <parent of the working directory>/misc.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")

# Only the file name of the configured path is used; the file itself is read from misc_dir.
user_info_filename = os.path.basename(paths_config['output_files']['user_info_df'])
output_path = os.path.join(misc_dir, user_info_filename)

user_info_df = pd.read_parquet(output_path)

In [0]:
# Columns removed from the loaded frame before any feature engineering takes place.
cols_to_drop_from_load = [
    'alcohol_frequency',
    'appVersion',
    'comscore_email',
    'consent3rd_party_cookies',
    'consent_mobile_device_identifiers',
    'consent_pp_tandc',
    'deleted',
    'driver_license',
    'electronics_ownership',
    'electronics_ownership_mc',
    'email_updates',
    'enabled',
    'fine_location',
    'fraud_lock',
    'locale',
    'loi',
    'main_email',
    'main_email_verified',
    'sms_updates',
    'system_group_id',
    'u_date_created',
    'u_last_updated',
    'uninstalled',
    'up_last_updated',
    'up_version',
    'us_date_created',
    'us_last_updated',
    'us_version',
]

In [0]:
# Remove the unused columns; a missing label raises KeyError (pandas default).
user_info_df = user_info_df.drop(cols_to_drop_from_load, axis=1)

In [0]:
# Snapshot of the column names before any engineered column is added;
# passed to filter_to_engineered_features further down.
original_columns = {column for column in user_info_df.columns}

In [0]:
# Cap task time at its 95th percentile. The quantile is computed once, and
# Series.clip leaves missing values missing (an `np.where(x < q, x, q)` would
# turn NaN task times into the cap, because `NaN < q` evaluates to False).
task_time_p95 = user_info_df['task_time_taken_s'].quantile(0.95)
user_info_df['task_time_taken_s_capped'] = (
    user_info_df['task_time_taken_s'].clip(upper=task_time_p95)
)

# NOTE(review): the speed features are built from the uncapped column, not from
# 'task_time_taken_s_capped' — confirm that this is intended.
user_info_df = create_task_speed_features(
    user_info_df,
    task_time_col="task_time_taken_s",
    use_std_dev=True
)

In [0]:
user_info_df = create_all_temporal_features(
    user_info_df,
    date_col="date_completed",
    include_hourly=True,
)

# Select by name: every column starting with `is_` (whichever step created it)
# plus the four explicitly named hour/day columns.
named_temporal_columns = ('hour_of_day', 'day_of_week', 'hour_period', 'hour_of_day_label')
temporal_features = [
    col for col in user_info_df.columns
    if col.startswith('is_') or col in named_temporal_columns
]

print(f"Created {len(temporal_features)} temporal features:")
for feature in sorted(temporal_features):
    print(f"  - {feature}")

banner = "=" * 70

print("\n" + banner)
print("Task Completion Distribution by Hour:")
print(banner)
hour_dist = user_info_df['hour_of_day_label'].value_counts().sort_index()
print(hour_dist)

print("\n" + banner)
print("Sample Data with Hour Labels:")
print(banner)
sample_columns = ['date_completed', 'hour_of_day', 'hour_of_day_label', 'hour_period']
print(user_info_df[sample_columns].head(10))


In [0]:
# Run create_score_features once per score column, in the same order as before.
for score_name in ('quality', 'risk'):
    user_info_df = create_score_features(user_info_df, score_name)

In [0]:
# Show every column whose name contains 'date_'.
[name for name in user_info_df if 'date_' in name]

In [0]:
# Earliest `date_created` per respondent, exposed as `first_task_created`.
min_dates = (
    user_info_df
    .groupby('respondentPk', as_index=False)['date_created']
    .min()
    .rename(columns={'date_created': 'first_task_created'})
)

In [0]:
# Inspect the per-respondent earliest `date_created` (column `first_task_created`).
min_dates

In [0]:
# Attach each respondent's `first_task_created` to every one of their rows.
# Re-running this cell on an already merged frame makes pandas suffix the
# duplicated column (`_x` / `_y`), so run it once per fresh load.
user_info_df = pd.merge(user_info_df, min_dates, how="left", on="respondentPk")

# Whole days between the respondent's first created task and this task's completion.
elapsed_since_first_task = user_info_df["date_completed"] - user_info_df["first_task_created"]
user_info_df["days_active_before_task"] = elapsed_since_first_task.dt.days

In [0]:
# Bin edges handed to create_binned_features for `days_active_before_task`.
days_active_bin_edges = [2, 10, 20, 30, 50, 75, 100, 125, 150, 200, 250]

user_info_df, days_active_cols = create_binned_features(
    user_info_df,
    'days_active_before_task',
    bins=days_active_bin_edges,
    prefix='days_active_',
    include_mutually_exclusive=True,
)

In [0]:
# Lookup table from single-letter codes to household-income band labels.
# NOTE(review): presumably the codes are the raw values stored in
# `fulcrum_household_income` — confirm against the survey definition.
income_map = {
    "A": "Less than £15,000",
    "B": "£15,000 to £19,999",
    "C": "£20,000 to £24,999",
    "D": "£25,000 to £29,999",
    "E": "£30,000 to £34,999",
    "F": "£35,000 to £39,999",
    "G": "£40,000 to £44,999",
    "H": "£45,000 to £49,999",
    "I": "£50,000 to £59,999",
    "J": "£60,000 to £74,999",
    "K": "£75,000 to £84,999",
    "L": "£85,000 to £99,999",
    "M": "£100,000 to £124,999",
    "N": "£125,000 to £149,999",
    "O": "£150,000 to £174,999",
    "P": "£175,000 to £199,999",
    "Q": "£200,000 and above",
    "R": "Prefer not to answer",
}

# Pass the table and the column name to the helper; it returns the updated frame
# and a second value that is kept as `income_cols`.
user_info_df, income_cols = create_value_mapping_and_encode(
    user_info_df, 
    'fulcrum_household_income',
    income_map
)

In [0]:
# Lookup table from letter codes ("A".."Z", then "AA".."AY") to employer-industry labels.
# NOTE(review): presumably the codes are the raw values stored in
# `employer_industry` — confirm against the survey definition.
industry_map = {
    "A": "Accounting",
    "B": "Advertising",
    "C": "Agriculture/Fishing",
    "D": "Architecture",
    "E": "Automotive",
    "F": "Aviation",
    "G": "Banking/Financial",
    "H": "Bio-Tech",
    "I": "Brokerage",
    "J": "Carpenting/Electrical installations/VVS",
    "K": "Chemicals/Plastics/Rubber",
    "L": "Communications/Information",
    "M": "Computer Hardware",
    "N": "Computer Reseller (software/hardware)",
    "O": "Computer Software",
    "P": "Construction",
    "Q": "Consulting",
    "R": "Consumer Electronics",
    "S": "Consumer Packaged Goods",
    "T": "Education",
    "U": "Energy/Utilities/Oil and Gas",
    "V": "Engineering",
    "W": "Environmental Services",
    "X": "Fashion/Apparel",
    "Y": "Food/Beverage",
    "Z": "Government/Public Sector",
    "AA": "Healthcare",
    "AB": "Hospitality/Tourism",
    "AC": "Human Resources",
    "AD": "Information Technology/IT",
    "AE": "Insurance",
    "AF": "Internet",
    "AG": "Legal/Law",
    "AH": "Manufacturing",
    "AI": "Market Research",
    "AJ": "Marketing/Sales",
    "AK": "Media/Entertainment",
    "AL": "Military",
    "AM": "Non Profit/Social services",
    "AN": "Personal Services",
    "AO": "Pharmaceuticals",
    "AP": "Printing Publishing",
    "AQ": "Public Relations",
    "AR": "Real Estate/Property",
    "AS": "Retail/Wholesale trade",
    "AT": "Security",
    "AU": "Shipping/Distribution",
    "AV": "Telecommunications",
    "AW": "Transportation",
    "AX": "Other",
    "AY": "I don't work",
}

# Pass the table and the column name to the helper; it returns the updated frame
# and a second value that is kept as `industry_cols`.
user_info_df, industry_cols = create_value_mapping_and_encode(
    user_info_df, 
    'employer_industry',
    industry_map
)

In [0]:
# Inspect the second value returned by create_value_mapping_and_encode for `employer_industry`.
industry_cols

In [0]:
# Lookup table from single-letter codes to employer-department labels.
# NOTE(review): presumably the codes are the raw values stored in
# `employer_department` — confirm against the survey definition.
department_map = {
    "A": "Administration/General Staff",
    "B": "Customer Service/Client Service",
    "C": "Executive Leadership",
    "D": "Finance/Accounting",
    "E": "Human Resources",
    "F": "Legal/Law",
    "G": "Marketing",
    "H": "Operations",
    "I": "Procurement",
    "J": "Sales/Business Development",
    "K": "Technology Development Hardware (not only IT)",
    "L": "Technology Development Software (not only IT)",
    "M": "Technology Implementation",
    "N": "Other",
    "O": "I don't work",
}

# Pass the table and the column name to the helper; it returns the updated frame
# and a second value that is kept as `department_cols`.
user_info_df, department_cols = create_value_mapping_and_encode(
    user_info_df, 
    'employer_department',
    department_map
)

In [0]:
# Inspect the second value returned by create_value_mapping_and_encode for `employer_department`.
department_cols

In [0]:
# Lookup table from single-letter codes to UK region labels.
# NOTE(review): presumably the codes are the raw values stored in
# `fulcrum_region` — confirm against the survey definition.
region_map = {
    "A": "East Midlands (England)",
    "B": "East Of England",
    "C": "London",
    "D": "North East (England)",
    "E": "North West (England)",
    "F": "Northern Ireland",
    "G": "Scotland",
    "H": "South East (England)",
    "I": "South West (England)",
    "J": "Wales",
    "K": "West Midlands (England)",
    "L": "Yorkshire And The Humber",
}

# Pass the table and the column name to the helper; it returns the updated frame
# and a second value that is kept as `region_cols`.
user_info_df, region_cols = create_value_mapping_and_encode(
    user_info_df, 
    'fulcrum_region',
    region_map
)

In [0]:
# Inspect the second value returned by create_value_mapping_and_encode for `fulcrum_region`.
region_cols

In [0]:
# Run create_task_amount_features on the `task_completed` column; the helper returns
# the updated frame and a second value that is kept as `task_labels`.
user_info_df, task_labels = create_task_amount_features(user_info_df, 'task_completed')

In [0]:
# Alphabetical list of every column currently in the frame.
sorted(user_info_df.columns.tolist())

In [0]:
# One row per respondent with the number of distinct `request-remote-addr` values.
ip_counts = (
    user_info_df
    .groupby('respondentPk')['request-remote-addr']
    .nunique()
    .reset_index(name='ip_address_count')
)

# Broadcast the per-respondent count back onto every row of the main frame.
user_info_df = pd.merge(user_info_df, ip_counts, how='left', on='respondentPk')

# Summary stats, based on one `ip_address_count` value per respondent.
ip_count_per_respondent = user_info_df.groupby('respondentPk')['ip_address_count'].first()
print(f"Total respondents: {user_info_df['respondentPk'].nunique():,}")
print(f"Respondents with 1 IP: {(ip_count_per_respondent == 1).sum():,}")
print(f"Respondents with 2+ IPs: {(ip_count_per_respondent > 1).sum():,}")

In [0]:
user_info_df = add_device_categories(user_info_df)

# Report the cardinality and value frequencies of the two category columns
# (expected to be present after add_device_categories).
hardware = user_info_df['hardware_category']
print(f"Hardware categories: {hardware.nunique()}")
print(hardware.value_counts())

manufacturer = user_info_df['manufacturer_category']
print(f"\nManufacturer categories: {manufacturer.nunique()}")
print(manufacturer.value_counts())

In [0]:
# Round both scores to the nearest multiple of 10 and store them as integers.
# `astype(int)` raises if a score is missing.
user_info_df = user_info_df.assign(
    risk_round10=user_info_df['risk'].round(-1).astype(int),
    quality_round10=user_info_df['quality'].round(-1).astype(int),
)

In [0]:
# Columns handed to batch_one_hot_encode as `columns`.
str_vars_to_encode = [
    "device",
    "hardware_category",  # from ditr
    "manufacturer_category",  # from ditr
    "ditr_os",  # from ditr
    "email_verified",
    "employee_title",
    "employer_department",
    "employer_industry",
    "entryPoint",
    "exposure_band",
    "fulcrum_region",
    "gambling_participation_mc",
    "gender",
    "notify_new_task",
    "notify_task_payout",
    "quality_round10",
    "risk_round10",
    "share_location_data",
    "taskTitle",
    "task_length_of_task",
    "ip_address_count",
]

# Columns handed to batch_one_hot_encode as `as_string_columns`: a fixed list,
# then every column containing 'days_active_', then every column containing
# 'totaal_tasks_'.
# NOTE(review): 'totaal_tasks_' looks like a typo of 'total_tasks_'; if no column
# contains that exact text the last comprehension silently adds nothing — confirm
# against the column names produced by create_task_amount_features.
nonstr_vars_to_encode = [
    "email_verified",
    "ip_address_count",
    "notify_new_task",
    "notify_task_payout",
    "quality_round10",
    "risk_round10",
    "share_location_data",
    "task_length_of_task",
]
nonstr_vars_to_encode += [col for col in user_info_df.columns if 'days_active_' in col]
nonstr_vars_to_encode += [col for col in user_info_df.columns if 'totaal_tasks_' in col]

user_info_df = batch_one_hot_encode(
    user_info_df,
    columns=str_vars_to_encode,
    as_string_columns=nonstr_vars_to_encode,
)

In [0]:
# Columns passed as `additional_columns` alongside `original_columns` and the id column.
extra_columns_to_keep = [
    'risk',
    'quality',
    'age_YOB',
    'days_active',
    'task_time_minutes',
    'task_completed',
    'exposure_band',
    'days_active_before_task',
]

user_info_df_final = filter_to_engineered_features(
    user_info_df,
    original_columns,
    id_column='respondentPk',
    additional_columns=extra_columns_to_keep,
)

In [0]:
# Preview the engineered frame. A pandas DataFrame has no `.display()` method
# (the call raises AttributeError), so end the cell on `.head()` and let the
# notebook render it. Assumes filter_to_engineered_features returns a pandas
# DataFrame, as the later `.to_parquet(..., index=False)` call suggests.
user_info_df_final.head()

In [0]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError, so this cell always fails.
# Presumably it is a deliberate stop so that "Run All" halts before the cells below
# (including the parquet export) — confirm, then remove it or replace it with an explicit guard.
break

In [0]:
# Summarise the final frame: row/column counts, feature count, and column names.
# The feature count leaves out one column — presumably the `respondentPk` id column; confirm.
print(f"Final dataframe: {user_info_df_final.shape[0]} rows, {user_info_df_final.shape[1]} columns")
print(f"Engineered features: {user_info_df_final.shape[1] - 1}")
print(f"\nColumns: {sorted(user_info_df_final.columns.tolist())}")

In [0]:
# Write the engineered frame under misc_dir, using only the file name from the config.
# The config is indexed directly (as the load cell does for 'user_info_df') so that a
# missing 'user_info_df_post_eda' entry raises a clear KeyError instead of handing
# None to os.path.basename.
user_info_POSTEDA_path = os.path.join(
    misc_dir,
    os.path.basename(paths_config['output_files']['user_info_df_post_eda']),
)

user_info_df_final.to_parquet(user_info_POSTEDA_path, index=False)