In [0]:
# %pip install shap  # uncomment if shap is missing from the kernel's environment

In [0]:
import yaml
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from modelling.modelling import (
    build_linear_baseline,
    build_logistic_regression_model,
    build_random_forest_model,
    run_all_models,
)

from modelling.modelling_utils import (
    remove_collinear_features,
    run_shap_analysis,
    create_stakeholder_report,
    print_model_summary,
    get_significant_features,
    export_results,
    plot_shap_summary,
    plot_shap_bar,
    plot_shap_dependence,
)

# Read both YAML config files; the paths are relative to the current working directory.
with open('../configs/statistical_tests.yaml') as stats_file:
    stats_config = yaml.safe_load(stats_file)

with open('../configs/data_paths.yaml') as paths_file:
    paths_config = yaml.safe_load(paths_file)

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
print("‚úì Imports and configs loaded successfully")

In [0]:
# Load the user-level dataframes.
# Only the file *names* come from the config; each file is read from the `misc`
# directory that sits in the parent of the current working directory.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, os.pardir))
misc_dir = os.path.join(repo_root, "misc")

output_files = paths_config['output_files']

user_df_input_path = os.path.join(
    misc_dir, os.path.basename(output_files['user_info_df_post_eda'])
)
wonky_df_input_path = os.path.join(
    misc_dir, os.path.basename(output_files['wonky_respondent_df'])
)
test_results_df_input_path = os.path.join(
    misc_dir, os.path.basename(output_files['test_results_df'])
)

user_info_df = pd.read_parquet(user_df_input_path)
wonky_respondent_df = pd.read_parquet(wonky_df_input_path)
test_results_df = pd.read_parquet(test_results_df_input_path)

# Keep only the rows whose outcome column is populated.
has_outcome = user_info_df['wonky_study_count'].notna()
user_info_df = user_info_df[has_outcome]
print(f"‚úì Data loaded: {user_info_df.shape}")

In [0]:
# Rows with a missing outcome were already dropped when the data was loaded, so
# filtering again here was a redundant copy of that step. Assert the invariant
# instead of silently re-filtering, then show the resulting shape.
assert user_info_df['wonky_study_count'].notna().all(), \
    "user_info_df still contains rows with a missing wonky_study_count"
user_info_df.shape

In [0]:
# Start from the features flagged by the `significant_both` column of the test results.
significant_mask = test_results_df['significant_both']
tested_features = test_results_df.loc[significant_mask, 'feature'].tolist()

# Drop every feature whose name contains 'is_weekend', then add the two extra series.
candidate_features = [name for name in tested_features if 'is_weekend' not in name]
candidate_features += ['risk', 'quality']

# dict.fromkeys keeps the first occurrence of each name in order, so a feature that
# is both in the test results and in the manual additions is passed on only once
# (a repeated name would select the same column twice downstream).
significant_features = list(dict.fromkeys(candidate_features))

print(f"Features from testing: {len(significant_features)}")
print(significant_features[:10])  # Show first 10

In [0]:
# Ad hoc variable removal (for example, risk dummies -> can use the whole series instead)

#### stage 1 model

In [0]:
# Settings for run_all_models, gathered in one place so they are easy to scan and tweak.
model_settings = dict(
    df=user_info_df,
    feature_cols=significant_features,
    outcome_var="wonky_study_count",
    user_id_var="respondentPk",
    include_logistic=True,
    include_vif=True,
    rf_n_estimators=100,
    lr_regularization_C=1.0,
)
results = run_all_models(**model_settings)

In [0]:
# Pass the existing results through run_shap_analysis; its return value replaces `results`.
shap_settings = {
    "results": results,
    "df": user_info_df,
    "sample_size": 1000,
}
results = run_shap_analysis(**shap_settings)

In [0]:
# `comparison` was never assigned anywhere in this notebook, so this cell raised a
# NameError on a fresh kernel (it only worked with leftover kernel state).
# Bind it explicitly from the results so the cell survives Restart & Run All.
comparison = results['comparison']

# Columns that are always shown, followed by the optional ones when present.
display_cols = ['feature', 'linear_coef', 'linear_p_value', 'rf_importance_pct']
if 'lr_odds_ratio' in comparison.columns:
    display_cols.extend(['lr_odds_ratio', 'lr_p_value'])
if 'shap_direction' in comparison.columns:
    display_cols.extend(['shap_importance', 'shap_direction'])
display_cols.extend(['avg_rank', 'VIF'])

print("\nTop 20 Features by Consensus Rank:")
display(comparison.head(20)[display_cols])

In [0]:
print("\n" + "=" * 80)
print("üìà LINEAR REGRESSION RESULTS")
print("=" * 80)
print("Interpretation: Coefficient = change in wonky_study_count when feature=1 vs 0")
print("Stars: *** p<0.001, ** p<0.01, * p<0.05\n")


def _significance_stars(p_value):
    """Return the star marker for a p-value ('' when no threshold is met)."""
    if p_value < 0.001:
        return '***'
    if p_value < 0.01:
        return '**'
    if p_value < 0.05:
        return '*'
    return ''


def _format_coefficient(row):
    """Render the signed 4-decimal coefficient followed by its star marker."""
    return f"{row['coefficient']:+.4f}{row['sig']}"


# Work on a copy so the table stored in `results` is left untouched.
linear_df = results['linear_importance'].copy()
linear_df['sig'] = linear_df['p_value'].apply(_significance_stars)
linear_df['coef_display'] = linear_df.apply(_format_coefficient, axis=1)

linear_columns = ['feature', 'coef_display', 'p_value', 'importance']
display(linear_df.head(20)[linear_columns])


In [0]:
# This section is rendered only when `results` contains an 'lr_importance' entry.
if 'lr_importance' in results:
    print("\n" + "=" * 80)
    print("üéØ LOGISTIC REGRESSION - ODDS RATIOS")
    print("=" * 80)
    print("""
Odds Ratio Interpretation:
  ‚Ä¢ OR = 1.0:  No effect
  ‚Ä¢ OR = 1.5:  50% more likely to be wonky when feature=1
  ‚Ä¢ OR = 2.0:  2x more likely (100% increase)
  ‚Ä¢ OR = 0.5:  50% less likely (half the odds)
""")

    def interpret_or(x):
        """Turn an odds ratio into a short human-readable label."""
        # Missing values and anything outside [0.01, 100] are flagged as extreme.
        if pd.isna(x) or not (0.01 <= x <= 100):
            return "‚ö†Ô∏è Extreme"
        # Values within 0.05 of 1.0 are treated as no effect.
        if abs(x - 1) < 0.05:
            return "No effect"
        if x <= 1:
            return f"‚Üì {(1-x)*100:.0f}% less likely"
        pct = (x - 1) * 100
        # Up to a doubling is reported as a percentage, beyond that as a multiplier.
        if pct <= 100:
            return f"‚Üë {pct:.0f}% more likely"
        return f"‚Üë {x:.1f}x"

    lr_df = results['lr_importance'].copy()
    lr_df['interpretation'] = lr_df['lr_odds_ratio'].apply(interpret_or)

    # Keep rows whose odds ratio lies in [0.01, 100]; NaN rows drop out as well.
    in_valid_range = lr_df['lr_odds_ratio'].between(0.01, 100)
    valid_lr = lr_df[in_valid_range]

    lr_columns = ['feature', 'lr_coefficient', 'lr_odds_ratio',
                  'interpretation', 'lr_p_value']
    display(valid_lr.head(20)[lr_columns])

In [0]:
separator = "=" * 80
print("\n" + separator)
print("üå≤ RANDOM FOREST IMPORTANCE")
print(separator)

# Copy the random-forest importance table and show its first 20 rows.
rf_df = results['rf_importance'].copy()
rf_columns = ['feature', 'rf_importance', 'rf_importance_pct']
display(rf_df[rf_columns].head(20))

In [0]:
# This section is rendered only when `results` contains a 'shap_importance' entry.
if 'shap_importance' in results:
    separator = "=" * 80
    print("\n" + separator)
    print("üîÆ SHAP FEATURE IMPORTANCE")
    print(separator)
    print("""
SHAP shows HOW features affect predictions:
  ‚Ä¢ shap_importance: How much the feature matters
  ‚Ä¢ shap_mean: Direction (+ increases wonky, - decreases)
  ‚Ä¢ shap_direction: Plain English interpretation
""")

    # Copy the SHAP importance table and show its first 20 rows.
    shap_df = results['shap_importance'].copy()
    shap_columns = ['feature', 'shap_importance', 'shap_mean', 'shap_direction']
    display(shap_df[shap_columns].head(20))

In [0]:
# Beeswarm plot - shows direction and distribution
# Drawn by the project helper plot_shap_summary from `results`;
# max_display=20 presumably caps the number of features shown - confirm in modelling_utils.
plot_shap_summary(results, max_display=20)

In [0]:
# Bar plot drawn by the project helper plot_shap_bar from `results`;
# max_display=20 presumably caps the number of features shown - confirm in modelling_utils.
plot_shap_bar(results, max_display=20)

In [0]:
# Explore specific features with SHAP dependence plots.
focus_feature = 'is_saturday'

# How does is_saturday affect predictions?
plot_shap_dependence(results, focus_feature)

# Same feature again, this time with interaction coloring.
plot_shap_dependence(
    results,
    focus_feature,
    interaction_feature='is_early_morning',
)

In [0]:
print("\n" + "=" * 80)
print("üîç MULTICOLLINEARITY CHECK (VIF)")
print("=" * 80)
print("""
VIF Interpretation:
  ‚Ä¢ VIF < 5:  ‚úì No concern
  ‚Ä¢ VIF 5-10: ‚ö° Moderate
  ‚Ä¢ VIF > 10: ‚ö†Ô∏è High - consider removing
""")


def _vif_status(vif):
    """Label a VIF value; anything not below 10 falls through to the HIGH label."""
    if vif < 5:
        return '‚úì OK'
    if vif < 10:
        return '‚ö° Moderate'
    return '‚ö†Ô∏è HIGH'


# Work on a copy so the table stored in `results` is left untouched.
vif_df = results['vif_data'].copy()
vif_df['status'] = vif_df['VIF'].apply(_vif_status)

# Report only the features at or above the "moderate" threshold.
high_vif = vif_df[vif_df['VIF'] >= 5]
if high_vif.empty:
    print("‚úì All features have VIF < 5")
else:
    print("Features with VIF >= 5:")
    display(high_vif[['feature', 'VIF', 'status']])

In [0]:
separator = "=" * 80
print("\n" + separator)
print("üìã STAKEHOLDER REPORT")
print(separator)

# Build the report with the project helper and show its first 20 rows.
stakeholder_report = create_stakeholder_report(results)
report_preview = stakeholder_report.head(20)
display(report_preview)

In [0]:
print("\n" + "=" * 80)
print("üí° KEY INSIGHTS")
print("=" * 80)

# Top consensus features: the first five rows of the comparison table.
print("\nüèÜ TOP 5 CONSENSUS FEATURES:")
top_consensus = results['comparison'].head(5)
for rank, (_, feature_row) in enumerate(top_consensus.iterrows(), start=1):
    print(f"\n{rank}. {feature_row['feature']}")
    print(f"   Linear coef: {feature_row['linear_coef']:+.4f} (p={feature_row['linear_p_value']:.4e})")
    print(f"   RF importance: {feature_row['rf_importance_pct']:.2f}%")
    # The odds ratio is optional: print it only when the column exists and holds a value.
    has_odds_ratio = 'lr_odds_ratio' in feature_row and pd.notna(feature_row['lr_odds_ratio'])
    if has_odds_ratio:
        print(f"   Odds Ratio: {feature_row['lr_odds_ratio']:.3f}")
    if 'shap_direction' in feature_row:
        print(f"   SHAP: {feature_row['shap_direction']}")

# Features that increase vs decrease wonky, split on the sign of the mean SHAP value.
if 'shap_importance' in results:
    shap_df = results['shap_importance']
    mean_shap = shap_df['shap_mean']

    # A small dead band around zero (+/- 0.001) is left out of both lists.
    increases = shap_df[mean_shap > 0.001].head(5)
    decreases = shap_df[mean_shap < -0.001].head(5)

    print("\nüî¥ FEATURES THAT INCREASE WONKY:")
    for feature, shap_mean in zip(increases['feature'], increases['shap_mean']):
        print(f"   {feature}: +{shap_mean:.4f}")

    print("\nüîµ FEATURES THAT DECREASE WONKY:")
    for feature, shap_mean in zip(decreases['feature'], decreases['shap_mean']):
        print(f"   {feature}: {shap_mean:.4f}")

In [0]:
separator = "=" * 80
print("\n" + separator)
print("üéØ HIGH-CONFIDENCE FEATURES")
print(separator)

# Ask the project helper for the features passing both thresholds, then list them sorted.
high_confidence = get_significant_features(
    results,
    p_threshold=0.05,
    min_rf_importance=0.01,
)
print(f"\nFeatures significant in Linear + top RF importance: {len(high_confidence)}")
for feature_name in sorted(high_confidence):
    print(f"  ‚úì {feature_name}")

In [0]:
separator = "=" * 80
print("\n" + separator)
print("üíæ EXPORT")
print(separator)

# CSV export via the project helper (path is relative to the working directory).
export_df = export_results(results, 'feature_importance_results.csv')

# Parquet copy of the comparison table for downstream use.
comparison_table = results['comparison']
comparison_table.to_parquet('modelling_comparison.parquet', index=False)
print("‚úì Results saved to modelling_comparison.parquet")

In [0]:
# Interactive Plotly chart comparing Linear vs RF importance

comparison_df = results['comparison'].head(20).copy()

fig = go.Figure()

# Linear coefficients
fig.add_trace(go.Bar(
    x=comparison_df['linear_coef'],
    y=comparison_df['feature'],
    orientation='h',
    name='Linear Coefficient',
    marker=dict(color='#1f77b4'),
))

# RF importance (scaled for visibility): stretch it so its largest bar matches the
# largest absolute linear coefficient, since both share one x-axis.
max_abs_coef = comparison_df['linear_coef'].abs().max()
max_rf_importance = comparison_df['rf_importance'].max()
# Guard the ratio: a zero or NaN maximum would make the scale inf/NaN and the RF
# bars would silently disappear. Fall back to plotting the raw importances.
if pd.notna(max_abs_coef) and pd.notna(max_rf_importance) and max_rf_importance > 0:
    rf_scale = max_abs_coef / max_rf_importance
else:
    rf_scale = 1.0

fig.add_trace(go.Bar(
    x=comparison_df['rf_importance'] * rf_scale,
    y=comparison_df['feature'],
    orientation='h',
    name='RF Importance (scaled)',
    marker=dict(color='#ff7f0e'),
))

fig.update_layout(
    title="Top 20 Features: Linear Coefficient vs RF Importance",
    xaxis_title='Value',
    yaxis_title='Feature',
    barmode='group',
    # Reverse the y-axis so the first row of the table is drawn at the top.
    yaxis=dict(autorange='reversed'),
    height=600,
    margin=dict(l=200),
)

fig.show()

In [0]:
# The 'results' dictionary contains everything:
# - results['linear_model']: OLS model
# - results['rf_model']: Random Forest model  
# - results['lr_model']: Logistic Regression model
# - results['comparison']: Unified comparison table
# - results['shap_values']: SHAP values array
# - results['shap_explainer']: SHAP explainer object