In [None]:
# Import libraries
import pprint
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import yaml

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.statistical_tests import (
    OLS_with_cluster_robust_test,
    logistic_regression_with_cluster_robust_test,
    run_combined_regression_tests,
    format_ols_for_stakeholders,
    format_odds_ratios_for_stakeholders,
    generate_testing_executive_summary,
)

from eda.visualizations import (
    create_breakdown_summary,
    create_breakdown_chart,
    create_coefficient_forest_plot,
    create_odds_ratio_plot,
)

# Load the two YAML configuration files used by this notebook. The paths are
# relative to the current working directory.
with open('../configs/statistical_tests.yaml', 'r') as stats_file:
    stats_config = yaml.safe_load(stats_file)

with open('../configs/data_paths.yaml', 'r') as paths_file:
    paths_config = yaml.safe_load(paths_file)

# Do not truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")

In [0]:
# Treat the parent of the current working directory as the repo root and
# locate its misc/ directory.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")

# Only the file name of the configured path is used; the file itself is read
# from misc/.
user_info_filename = os.path.basename(
    paths_config['output_files']['user_info_df_post_eda']
)
user_df_input_path = os.path.join(misc_dir, user_info_filename)

user_info_df = pd.read_parquet(user_df_input_path)

In [0]:
# Keep only rows whose exposure_band is not null, then renumber the index from 0.
has_exposure_band = user_info_df['exposure_band'].notna()
user_info_df = user_info_df[has_exposure_band].reset_index(drop=True)

#### Statistical Testing

Run OLS and logistic regression tests on every feature set to identify significant predictors of wonkiness.

In [None]:
# Run the combined regression tests once per configured feature set and
# collect one result frame per set.
feature_sets = stats_config["feature_sets"]
banner = '--' * 50
results = []

for feature_set_name, feature_list in feature_sets.items():
    print(banner)
    print(f"Working on {feature_set_name}")
    print(banner)

    # One call covers both tests; n_jobs=-1 is forwarded to the helper
    # (presumably "use all cores" — confirm in eda.statistical_tests).
    combined_df = run_combined_regression_tests(
        user_info_df,
        feature_set=feature_list,
        outcome_var="wonky_study_count",
        user_id_var="respondentPk",
        significance_level=0.05,
        n_jobs=-1,
    )

    # Tag each row with its feature set and move the index into a column so
    # the frames can be stacked.
    combined_df = combined_df.assign(feature_set=feature_set_name).reset_index()
    results.append(combined_df)

# Stack every set's results and key them by (feature_set, feature).
results_df = (
    pd.concat(results, ignore_index=True)
    .set_index(["feature_set", "feature"])
)

In [0]:
# Summarise the regression results in three groups: features significant in
# both tests, features with a medium/large effect in both, and features
# significant in only one of the two tests.
print("=" * 70)
print("SIGNIFICANT IN BOTH OLS AND LOGISTIC (High Confidence)")
print("=" * 70)
sig_both = results_df[results_df['significant_both'].eq(True)].reset_index()
print(f"Found {len(sig_both)} features significant in both tests")
# pandas DataFrames have no .display() method (it raises AttributeError in
# standard Jupyter); use the notebook's display() function instead.
display(sig_both[['feature_set', 'feature',
                  'ols_mean_difference', 'ols_p_value', 'ols_cohens_d',
                  'logit_odds_ratio', 'logit_p_value']])

# The filter accepts both 'medium' and 'large' effect sizes, so the labels
# say so rather than claiming "large" only.
print("\n" + "=" * 70)
print("MEDIUM OR LARGE EFFECT SIZE IN BOTH (Most Important)")
print("=" * 70)
large_effect = results_df[
    (results_df['ols_effect_size_interpretation'].isin(['medium', 'large'])) &
    (results_df['logit_effect_size_interpretation'].isin(['medium', 'large']))
].reset_index()
print(f"Found {len(large_effect)} features with medium or large effect in both")

# Significant in at least one test but not in both.
print("\n" + "=" * 70)
print("DISCREPANCIES: Significant in one test only")
print("=" * 70)
discrepant = results_df[
    results_df['significant_either'] & ~results_df['significant_both']
].reset_index()
print(f"Found {len(discrepant)} discrepant features")

In [0]:
# sig_both has one row per (feature_set, feature) pair, so the same feature
# name may occur more than once. Drop repeats (keeping first-seen order) so
# the modelling feature list contains each feature only once.
modelling_features = sig_both['feature'].drop_duplicates().tolist()
print(f"\nFeatures to use in modelling: {len(modelling_features)}")
print(modelling_features)


#### Stakeholder-Friendly Results

Format results for non-technical stakeholders with easy-to-interpret metrics.

In [None]:
# Stakeholder view of the OLS results. The overall mean of the outcome column
# is passed to the formatter as the baseline.
baseline_mean = user_info_df['wonky_study_count'].mean()
ols_stakeholder = format_ols_for_stakeholders(
    results_df.reset_index(), baseline_mean=baseline_mean
)

header_rule = "=" * 70
print(header_rule)
print("OLS RESULTS - STAKEHOLDER VIEW")
print(header_rule)
print(f"Baseline wonkiness rate: {baseline_mean:.3f}")
print("\nInterpretation: % Change shows how much wonkiness changes when feature = 1 vs 0\n")

# Show the first 20 rows of the stakeholder-facing columns.
ols_display_cols = [
    'feature',
    'ols_mean_difference',
    'pct_change',
    'ci_95',
    'interpretation',
    'impact_category',
]
display(ols_stakeholder[ols_display_cols].head(20))

In [None]:
# Stakeholder view of the logistic-regression odds ratios.
results_flat = results_df.reset_index()
or_stakeholder = format_odds_ratios_for_stakeholders(results_flat)

header_rule = "=" * 70
print(header_rule)
print("LOGISTIC REGRESSION - ODDS RATIOS (STAKEHOLDER VIEW)")
print(header_rule)
print("""
Interpretation:
  • OR > 1: Feature INCREASES likelihood of being wonky
  • OR < 1: Feature DECREASES likelihood of being wonky
  • OR = 1: No effect
""")

# Show the first 20 rows of the stakeholder-facing columns.
or_display_cols = [
    'feature',
    'logit_odds_ratio',
    'or_formatted',
    'interpretation',
    'impact_category',
    'direction',
]
display(or_stakeholder[or_display_cols].head(20))

#### Forest Plots

Visual representations of coefficients and odds ratios with confidence intervals.

In [None]:
# Forest plot of the OLS coefficients, built from the stakeholder-formatted
# OLS table. The column names tell the helper where to find each quantity.
ols_plot_options = dict(
    coefficient_col='ols_mean_difference',
    ci_lower_col='ci_lower',
    ci_upper_col='ci_upper',
    p_value_col='ols_p_value',
    title="OLS Coefficients with 95% Confidence Intervals",
    max_features=15,
)
fig_ols = create_coefficient_forest_plot(ols_stakeholder, **ols_plot_options)
fig_ols.show()

In [None]:
# Forest plot of the odds ratios, built from the stakeholder-formatted
# odds-ratio table. The column names tell the helper where to find each quantity.
or_plot_options = dict(
    or_col='logit_odds_ratio',
    ci_lower_col='or_ci_lower',
    ci_upper_col='or_ci_upper',
    p_value_col='logit_p_value',
    title="Odds Ratios with 95% Confidence Intervals",
    max_features=15,
)
fig_or = create_odds_ratio_plot(or_stakeholder, **or_plot_options)
fig_or.show()

#### Executive Summary

High-level summary of key findings for stakeholders.

In [None]:
# Build the executive summary from the results table (index moved back into
# columns) and print it.
results_flat = results_df.reset_index()
summary = generate_testing_executive_summary(results_flat)
print(summary)

In [0]:
# Render the full results table. pandas DataFrames do not define .display(),
# so use the notebook's display() function.
display(results_df.reset_index())

In [None]:
# Export results for use in modelling notebook.
# Index the config key directly: with .get(), a missing 'test_results_df'
# entry would pass None to os.path.basename and fail with an opaque TypeError;
# direct indexing raises a KeyError that names the missing key.
test_results_filename = os.path.basename(
    paths_config['output_files']['test_results_df']
)
test_results_path = os.path.join(misc_dir, test_results_filename)

# Write the results with the (feature_set, feature) index moved into columns.
results_df.reset_index().to_parquet(test_results_path, index=False)
print(f"Results exported to: {test_results_path}")
print(f"Results exported to: {test_results_path}")