In [0]:
# Import libraries
import pprint
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import yaml

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.statistical_tests import (
    OLS_with_cluster_robust_test,
    logistic_regression_with_cluster_robust_test,
    run_combined_regression_tests,
)

from eda.visualizations import (
    create_breakdown_summary,
    create_breakdown_chart,
)

def _read_yaml(config_path):
    """Parse the YAML file at ``config_path`` with the safe loader and return the result."""
    with open(config_path, 'r') as config_file:
        return yaml.safe_load(config_file)


# stats_config is read below through its 'feature_sets' key; paths_config through
# its 'output_files' key. Both paths are relative to the notebook's working directory.
stats_config = _read_yaml('../configs/statistical_tests.yaml')
paths_config = _read_yaml('../configs/data_paths.yaml')

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")

In [0]:
# Locate <parent of the working directory>/misc, where the input parquet files are read from.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")


def _misc_path(output_key):
    """Return the misc/ path that reuses the file name configured for ``output_key``."""
    configured_path = paths_config['output_files'][output_key]
    return os.path.join(misc_dir, os.path.basename(configured_path))


user_df_input_path = _misc_path('user_info_df_post_eda')
wonky_df_input_path = _misc_path('wonky_respondent_df')
# Disabled alternative input, kept for reference:
# user_df_max_input_path = _misc_path('user_info_df')

user_info_df = pd.read_parquet(user_df_input_path)
wonky_respondent_df = pd.read_parquet(wonky_df_input_path)
# user_info_df_max = pd.read_parquet(user_df_max_input_path)

#### Individual testing

In [0]:
# Pretty-print every feature set defined in the statistical-tests config,
# keeping the mapping's own key order rather than sorting it.
pprint.pprint(stats_config['feature_sets'], sort_dicts=False)

In [0]:
# Run the breakdown summary, the breakdown chart and both regression tests
# for a single feature set chosen by name from the config.
set_name = 'share_location_data'
feature_set = stats_config['feature_sets'][set_name]

# NOTE(review): the previous comment said group_col should be a binary outcome;
# only the chart call passes group_threshold, so confirm how the summary helper
# treats "wonky_study_count".
breakdown_summary = create_breakdown_summary(
    user_info_df,
    features=feature_set,
    group_col="wonky_study_count",
)
print(breakdown_summary)

fig = create_breakdown_chart(
    user_info_df,
    features=feature_set,
    group_col="wonky_study_count",
    group_threshold=0,
)
fig.show()

# Keyword arguments shared by the OLS and the logistic test.
regression_args = dict(
    feature_set=feature_set,
    outcome_var="wonky_study_count",
    user_id_var="respondentPk",
    significance_level=0.05,
)

# Tag each result table with its feature set and test type, then move the
# index into regular columns.
ols_df = (
    OLS_with_cluster_robust_test(user_info_df, **regression_args)
    .assign(feature_set=set_name, test_type="OLS")
    .reset_index()
)

logit_df = (
    logistic_regression_with_cluster_robust_test(user_info_df, **regression_args)
    .assign(feature_set=set_name, test_type="Logistic")
    .reset_index()
)

ols_df.display()
logit_df.display()

In [0]:
# item = (user_info_df[feature_set + ['exposure_band']].groupby('exposure_band').sum()/len(user_info_df)).T
# item['delta (e-c)'] = item['exposed'] - item['control']
# item

In [0]:
# df = item.reset_index()

# time_order = [
#     "is_12am", "is_1am", "is_2am", "is_3am", "is_4am", "is_5am",
#     "is_6am", "is_7am", "is_8am", "is_9am", "is_10am", "is_11am",
#     "is_12pm", "is_1pm", "is_2pm", "is_3pm", "is_4pm", "is_5pm",
#     "is_6pm", "is_7pm", "is_8pm", "is_9pm", "is_10pm", "is_11pm"
# ]

# df["index"] = pd.Categorical(df["index"], categories=time_order, ordered=True)

# df = df.sort_values("index")

# fig_bar = px.bar(
#     df,
#     x="index",
#     y="delta (e-c)",
#     title="Delta (Exposed - Control) by Time Band",
# )

# fig_bar.update_layout(
#     xaxis_title="Time band",
#     yaxis_title="Delta (E − C)",
#     template="plotly_white",
# )

# fig_bar.add_hline(y=0, line_dash="dash", line_color="black")
# fig_bar.show()

In [0]:
# user_info_df[feature_set].sum()/len(user_info_df)

#### Mass Testing

In [0]:
# results = []

# for feature_set_name in stats_config["feature_sets"]:
#     print(f"Working on {feature_set_name}")
    
#     feature_list = stats_config["feature_sets"][feature_set_name]
    
#     feature_df = OLS_with_cluster_robust_test(
#         user_info_df,
#         feature_set=feature_list,
#         outcome_var="wonky_study_count",
#         user_id_var="respondentPk",
#         significance_level=0.05,
#     )
    
#     feature_df["feature_set"] = feature_set_name
#     feature_df["outcome"] = "wonky_study_count"
    
#     feature_df = feature_df.reset_index()
#     results.append(feature_df)

# results_df = pd.concat(results, ignore_index=True)

# results_df = results_df.set_index(["feature_set", "feature", "outcome"])


# print("SIGNIFICANT RESULTS (p < 0.01)")
# sig_results = results_df[results_df['significant'] == True].reset_index()
# sig_results[['feature_set', 'feature', 'mean_difference', 'p_value', 't_statistic']].display()

In [0]:
# results_ols = []
# results_logit = []

# for feature_set_name in stats_config["feature_sets"]:
#     print(f"Working on {feature_set_name}")
    
#     feature_list = stats_config["feature_sets"][feature_set_name]
    
#     # OLS test
#     ols_df = OLS_with_cluster_robust_test(
#         user_info_df,
#         feature_set=feature_list,
#         outcome_var="wonky_study_count",
#         user_id_var="respondentPk",
#         significance_level=0.05,
#     )
#     ols_df["feature_set"] = feature_set_name
#     ols_df["test_type"] = "OLS"
#     ols_df = ols_df.reset_index()
#     results_ols.append(ols_df)
    
#     # Logistic test
#     logit_df = logistic_regression_with_cluster_robust_test(
#         user_info_df,
#         feature_set=feature_list,
#         outcome_var="wonky_study_count",
#         user_id_var="respondentPk",
#         significance_level=0.05,
#     )
#     logit_df["feature_set"] = feature_set_name
#     logit_df["test_type"] = "Logistic"
#     logit_df = logit_df.reset_index()
#     results_logit.append(logit_df)

# # Combine results
# ols_results_df = pd.concat(results_ols, ignore_index=True)
# logit_results_df = pd.concat(results_logit, ignore_index=True)

# # Merge OLS and Logistic results for side-by-side comparison
# combined_results = ols_results_df.merge(
#     logit_results_df,
#     on=['feature_set', 'feature'],
#     suffixes=('_ols', '_logit'),
#     how='outer'
# )

# # Add combined significance flags
# combined_results['significant_both'] = (
#     combined_results['significant_ols'].fillna(False) & 
#     combined_results['significant_logit'].fillna(False)
# )
# combined_results['significant_either'] = (
#     combined_results['significant_ols'].fillna(False) | 
#     combined_results['significant_logit'].fillna(False)
# )

In [0]:
# Display the configured feature sets (feature-set name -> list of feature columns).
stats_config["feature_sets"]

In [0]:
def _underscored_columns_with(keyword):
    """List the ``user_info_df`` columns whose name contains ``keyword`` and an underscore."""
    return [name for name in user_info_df.columns if keyword in name and '_' in name]


# Extra feature sets derived from column names rather than from the config.
risk_set = {'risk': _underscored_columns_with('risk')}
quality_set = {'quality': _underscored_columns_with('quality')}

# NOTE(review): 'totaal_tasks_completed' looks like it may be a typo — confirm it
# matches the actual column name in user_info_df.
added_cols = ['age_YOB', 'days_active', 'task_time_minutes', 'totaal_tasks_completed']
misc_set = {'misc': added_cols}

In [0]:
# Run the combined OLS + logistic tests for every feature set and stack the results.
#
# The sets to test are the configured ones plus the column-derived risk and quality
# sets. Only mappings are merged here: adding a list to a dict (`dict + list`) raises
# TypeError, and `+` binds tighter than `|`.
# misc_set is not included; append `| misc_set` once its column names are confirmed.
feature_sets = stats_config["feature_sets"] | risk_set | quality_set

results = []
# Iterate over the merged mapping so the derived sets ('risk', 'quality') resolve to
# their own feature lists instead of being looked up in the config (KeyError).
for feature_set_name, feature_list in feature_sets.items():
    print(f"Working on {feature_set_name}")

    # A derived set can be empty when no column name matches; there is nothing to test.
    if not feature_list:
        print(f"Skipping {feature_set_name}: no features")
        continue

    # Run both tests at once
    combined_df = run_combined_regression_tests(
        user_info_df,
        feature_set=feature_list,
        outcome_var="wonky_study_count",
        user_id_var="respondentPk",
        significance_level=0.05,
    )

    combined_df["feature_set"] = feature_set_name
    combined_df = combined_df.reset_index()
    results.append(combined_df)

# One row per (feature_set, feature) pair across all tested sets.
results_df = pd.concat(results, ignore_index=True)
results_df = results_df.set_index(["feature_set", "feature"])

In [0]:
def _print_banner(title, blank_line_before=True):
    """Print ``title`` between two 70-character rules, optionally preceded by a blank line."""
    rule = "=" * 70
    print("\n" + rule if blank_line_before else rule)
    print(title)
    print(rule)


# --- Features flagged significant by both tests -------------------------------
_print_banner("SIGNIFICANT IN BOTH OLS AND LOGISTIC (High Confidence)", blank_line_before=False)
both_mask = results_df['significant_both'] == True
sig_both = results_df.loc[both_mask].reset_index()
print(f"Found {len(sig_both)} features significant in both tests")
summary_columns = [
    'feature_set', 'feature',
    'ols_mean_difference', 'ols_p_value', 'ols_cohens_d',
    'logit_odds_ratio', 'logit_p_value',
]
sig_both[summary_columns].display()

# --- Features with a notable effect size in both tests ------------------------
# NOTE(review): the banner and message say "large", but the filter also accepts
# 'medium' — confirm which one is intended.
_print_banner("LARGE EFFECT SIZE IN BOTH (Most Important)")
notable_sizes = ['medium', 'large']
ols_notable = results_df['ols_effect_size_interpretation'].isin(notable_sizes)
logit_notable = results_df['logit_effect_size_interpretation'].isin(notable_sizes)
large_effect = results_df.loc[ols_notable & logit_notable].reset_index()
print(f"Found {len(large_effect)} features with large effect in both")

# --- Features significant in exactly one of the two tests ---------------------
_print_banner("DISCREPANCIES: Significant in one test only")
one_test_only = results_df['significant_either'] & ~results_df['significant_both']
discrepant = results_df.loc[one_test_only].reset_index()
print(f"Found {len(discrepant)} discrepant features")

In [0]:
# Features whose OLS and logistic effect sizes are both labelled 'medium' or 'large'.
large_effect

In [0]:
# Features flagged significant by one of the two tests but not by both.
discrepant

In [0]:
# Full combined results, indexed by (feature_set, feature).
results_df

In [0]:
# Carry forward the features that were significant in both tests as modelling inputs.
modelling_features = sig_both['feature'].to_list()
feature_count = len(modelling_features)
print(f"\nFeatures to use in modelling: {feature_count}")
print(modelling_features)


In [0]:
# Render the results with the (feature_set, feature) index moved back into columns.
results_df.reset_index().display()

In [0]:
# Persist the combined test results to misc/, reusing the file name configured
# under output_files -> test_results_df.
# Index the config directly (as the input paths are built) so a missing entry
# raises a clear KeyError rather than an opaque TypeError from basename(None).
test_results_file = os.path.basename(paths_config['output_files']['test_results_df'])
test_results_path = os.path.join(misc_dir, test_results_file)

# Move the (feature_set, feature) index back into columns before writing.
results_df.reset_index().to_parquet(test_results_path, index=False)