In [0]:
!pip install shap

In [0]:
# Import libraries
import yaml
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from modelling.modelling import (
    run_full_feature_importance_analysis_fixed,
)

# Load configs (paths are relative to the current working directory)
def _read_yaml(path):
    """Parse the YAML file at ``path`` with ``yaml.safe_load`` and return the result."""
    with open(path, 'r') as handle:
        return yaml.safe_load(handle)


stats_config = _read_yaml('../configs/statistical_tests.yaml')
paths_config = _read_yaml('../configs/data_paths.yaml')

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


In [0]:
# Locate <parent of cwd>/misc and read the three parquet inputs stored there.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")


def _misc_path(output_key):
    """Return the path inside ``misc_dir`` for the file configured under
    ``paths_config['output_files'][output_key]`` (only its base name is used)."""
    configured = paths_config['output_files'][output_key]
    return os.path.join(misc_dir, os.path.basename(configured))


user_df_input_path = _misc_path('user_info_df_post_eda')
wonky_df_input_path = _misc_path('wonky_respondent_df')
test_results_df_input_path = _misc_path('test_results_df')

# user level dataframe
user_info_df = pd.read_parquet(user_df_input_path)
# user level dataframe wonky_respondents
wonky_respondent_df = pd.read_parquet(wonky_df_input_path)
test_results_df = pd.read_parquet(test_results_df_input_path)

In [0]:
# Disabled filter: when enabled it keeps only rows whose 'exposure_band' is not missing.
# user_info_df = user_info_df[~user_info_df['exposure_band'].isna()]
# Display the (rows, columns) shape of the loaded user-level frame.
user_info_df.shape

In [0]:
# Features flagged in the 'significant' column of the test results, minus any
# whose name contains 'is_weekend'.
significant_features = [
    name
    for name in test_results_df.loc[test_results_df['significant'], 'feature'].tolist()
    if 'is_weekend' not in name
]
significant_features

In [0]:
# # Run analysis
# results = run_full_feature_importance_analysis(
#     df=user_info_df,
#     feature_cols=feature_columns,
#     outcome_var='wonky_study_count',
#     user_id_var='respondentPk'
# )

# # Access individual results
# print("\nLinear model R²:", results['linear_model'].rsquared)
# print("Random Forest CV R²:", results['cv_results']['test_r2'].mean())
# print("\nFeature consensus rankings:")
# print(results['comparison'][['feature', 'avg_rank']].head(10))

#### stage 1 model

In [0]:


# Run the feature-importance analysis (imported from modelling.modelling) on the
# significant features, with 'wonky_study_count' as outcome.
results = run_full_feature_importance_analysis_fixed(
    user_id_var='respondentPk',
    outcome_var='wonky_study_count',
    feature_cols=significant_features,
    df=user_info_df,
)

# Headline numbers
print("\n" + "=" * 80, "FINAL SUMMARY", "=" * 80, sep="\n")
print(f"\nLinear Model R²: {results['linear_model'].rsquared:.4f}")
print(f"Random Forest CV R²: {np.mean(results['cv_results']['test_r2']):.4f}")

# First five rows of the comparison table, printed without the index column.
_consensus_columns = ['feature', 'avg_rank', 'p_value']
print("\nTop 5 Features (consensus):")
print(results['comparison'][_consensus_columns].head(5).to_string(index=False))


In [0]:
results

In [0]:
# `results` is the dictionary holding all the model outputs

print("=" * 80, "MODEL RESULTS SUMMARY", "=" * 80, sep="\n")

# ============================================================================
# 1. LINEAR MODEL SUMMARY
# ============================================================================
print("\n" + "=" * 80, "1. LINEAR MODEL PERFORMANCE", "=" * 80, sep="\n")

# Fit statistics are read directly off the fitted linear model object.
linear_model = results['linear_model']
print(
    f"R²: {linear_model.rsquared:.4f}",
    f"Adjusted R²: {linear_model.rsquared_adj:.4f}",
    f"F-statistic: {linear_model.fvalue:.4f}",
    f"F-statistic p-value: {linear_model.f_pvalue:.4e}",
    f"Number of observations: {linear_model.nobs:.0f}",
    f"Number of features: {linear_model.df_model:.0f}",
    sep="\n",
)

# ============================================================================
# 2. LINEAR FEATURE IMPORTANCE
# ============================================================================
print("\n" + "=" * 80, "2. LINEAR MODEL FEATURE IMPORTANCE (Top 15)", "=" * 80, sep="\n")

# Work on a copy so the display-only columns are not written into `results`.
linear_importance = results['linear_importance'].copy()
linear_importance['p_value_formatted'] = linear_importance['p_value'].map(lambda x: f"{x:.4e}")
linear_importance['coef_formatted'] = linear_importance['coefficient'].map(lambda x: f"{x:+.4f}")

# First 15 rows, in whatever order the analysis returned them.
_linear_display_columns = ['feature', 'coef_formatted', 'p_value_formatted', 'importance']
display(linear_importance[_linear_display_columns].head(15))

# Restrict to coefficients with p < 0.01
sig_linear = linear_importance.loc[linear_importance['p_value'] < 0.01]
print(f"\nSignificant features (p < 0.01): {len(sig_linear)} out of {len(linear_importance)}")
if len(sig_linear):
    display(sig_linear[['feature', 'coefficient', 'p_value', 't_stat']])

# ============================================================================
# 3. MULTICOLLINEARITY CHECK (VIF)
# ============================================================================
print("\n" + "=" * 80, "3. MULTICOLLINEARITY DIAGNOSTICS (VIF)", "=" * 80, sep="\n")

vif_data = results['vif_data'].copy()
print("\nFeatures with high VIF (> 10 indicates multicollinearity):")

# Rows whose VIF exceeds the threshold of 10
vif_high = vif_data.loc[vif_data['VIF'] > 10].copy()
if len(vif_high) == 0:
    print("✓ No severe multicollinearity detected")
else:
    print(f"WARNING: {len(vif_high)} features with VIF > 10")
    display(vif_high.head(20))

# Shows the first 10 rows as returned; assumes `vif_data` is already ordered
# by VIF (descending) — TODO confirm against the analysis function.
print("\nTop 10 features by VIF:")
display(vif_data.head(10))

# ============================================================================
# 4. RANDOM FOREST PERFORMANCE
# ============================================================================
print("\n" + "="*80)
print("4. RANDOM FOREST PERFORMANCE")
print("="*80)

# Per-fold cross-validation scores, keyed by metric name.
cv_results = results['cv_results']

# Take the fold count from the scores themselves instead of assuming five
# folds; a hard-coded range would make the summary DataFrame below raise on
# any other CV configuration (all columns must have equal length).
n_folds = len(cv_results['test_r2'])

print(f"Cross-Validation Results ({n_folds}-fold, cluster-based):")
print(f"  Mean Test R²: {np.mean(cv_results['test_r2']):.4f} ± {np.std(cv_results['test_r2']):.4f}")
print(f"  Mean Train R²: {np.mean(cv_results['train_r2']):.4f} ± {np.std(cv_results['train_r2']):.4f}")
# Note: this is the square root of the mean MSE, not the mean of per-fold RMSEs.
print(f"  Mean Test RMSE: {np.sqrt(np.mean(cv_results['test_mse'])):.4f}")
print(f"  Mean Test MAE: {np.mean(cv_results['test_mae']):.4f}")

# Overfitting check: flag a train/test R² gap larger than 0.1
overfit_gap = np.mean(cv_results['train_r2']) - np.mean(cv_results['test_r2'])
if overfit_gap > 0.1:
    print(f"  WARNING: Possible overfitting detected (gap = {overfit_gap:.4f})")
else:
    print(f"  Overfitting check: OK (gap = {overfit_gap:.4f})")

# Show fold-by-fold results
cv_summary = pd.DataFrame({
    'Fold': range(1, n_folds + 1),
    'Train R²': cv_results['train_r2'],
    'Test R²': cv_results['test_r2'],
    'Test RMSE': np.sqrt(cv_results['test_mse']),
    'Test MAE': cv_results['test_mae']
})
print("\nFold-by-fold results:")
display(cv_summary)

# ============================================================================
# 5. RANDOM FOREST FEATURE IMPORTANCE
# ============================================================================
# Number of rows shown; used for both the banner and the table so the two
# cannot drift apart (the banner previously said 15 while 73 rows were shown).
rf_top_n = 15

print("\n" + "="*80)
print(f"5. RANDOM FOREST FEATURE IMPORTANCE (Top {rf_top_n})")
print("="*80)

# Work on a copy and add the importance expressed as a percentage (2 decimals).
rf_importance = results['rf_importance'].copy()
rf_importance['importance_pct'] = (rf_importance['importance'] * 100).round(2)

# First rows as returned; assumes `rf_importance` is already ordered by
# importance (descending) — TODO confirm against the analysis function.
display(rf_importance.head(rf_top_n))

# Show features with zero importance
zero_importance = rf_importance[rf_importance['importance'] == 0]
print(f"\nFeatures with zero importance: {len(zero_importance)} out of {len(rf_importance)}")

# ============================================================================
# 6. FEATURE COMPARISON (CONSENSUS RANKING)
# ============================================================================
print("\n" + "=" * 80, "6. FEATURE IMPORTANCE COMPARISON (Consensus Ranking)", "=" * 80, sep="\n")

# Copy the comparison table and append display-friendly columns.
comparison = results['comparison'].copy()
comparison['p_value_formatted'] = comparison['p_value'].map(lambda x: f"{x:.4e}")
comparison['coef_formatted'] = comparison['coefficient'].map(lambda x: f"{x:+.4f}")
comparison['rf_imp_pct'] = comparison['rf_importance'].mul(100).round(2)

_comparison_display_columns = [
    'feature', 'coef_formatted', 'p_value_formatted',
    'rf_imp_pct', 'linear_rank', 'rf_rank', 'avg_rank',
]
print("\nTop 20 features by average rank (consensus across both models):")
display(comparison[_comparison_display_columns].head(20))

# ============================================================================
# 7. KEY INSIGHTS
# ============================================================================
print("\n" + "="*80)
print("7. KEY INSIGHTS & RECOMMENDATIONS")
print("="*80)

# Model performance comparison
# NOTE(review): the linear figure is the fitted model's `rsquared` while the
# forest figure is the mean cross-validated test score — presumably in-sample
# vs out-of-sample, so treat the comparison as indicative only; confirm.
linear_r2 = results['linear_model'].rsquared
rf_r2 = np.mean(cv_results['test_r2'])

print("\nModel Performance Comparison:")
print(f"  Linear Model R²: {linear_r2:.4f}")
print(f"  Random Forest R² (CV): {rf_r2:.4f}")

if rf_r2 > linear_r2:
    print(f"  Random Forest performs better (+{rf_r2-linear_r2:.4f} R²)")
    print("  Suggests non-linear relationships in the data")
else:
    print(f"  Linear model performs similarly (difference: {linear_r2-rf_r2:.4f})")
    print("  Linear relationships dominate")

# Feature consistency: how many features appear in the first 10 rows of both tables
top_linear = set(linear_importance.head(10)['feature'].values)
top_rf = set(rf_importance.head(10)['feature'].values)
overlap = top_linear & top_rf

print("\nFeature Importance Consistency:")
print(f"  Top 10 features overlap: {len(overlap)} features")
print(f"  Overlapping features: {', '.join(sorted(overlap))}")

if len(overlap) >= 5:
    print("  Good agreement between models")
else:
    print("  Low agreement - models capture different patterns")

# Top consensus features
# One constant drives both the heading and the loop (the heading previously
# announced 5 features while 20 were listed).
# Assumes `comparison` is already ordered by avg_rank — TODO confirm.
top_n_consensus = 5
print(f"\nTop {top_n_consensus} Consensus Features (by avg_rank):")
# Number rows by position (1..N): index labels are not guaranteed to be 0..N-1.
for position, (_, row) in enumerate(comparison.head(top_n_consensus).iterrows(), start=1):
    print(f"  {position}. {row['feature']}")
    print(f"     Linear coef: {row['coefficient']:+.4f} (p={row['p_value']:.4e})")
    print(f"     RF importance: {row['rf_importance']:.4f}")
    print(f"     Average rank: {row['avg_rank']:.1f}")

# ============================================================================
# 8. SAVE RESULTS FOR LATER
# ============================================================================
print("\n" + "=" * 80, "8. EXPORT OPTIONS", "=" * 80, sep="\n")

# Keep only the raw (unformatted) columns; nothing is written to disk here.
_export_columns = [
    'feature', 'coefficient', 'p_value', 'rf_importance',
    'linear_rank', 'rf_rank', 'avg_rank',
]
comparison_export = comparison[_export_columns].copy()


#### stage 2 model

In [0]:
# Names of features whose rounded percentage importance is above zero.
# NOTE(review): 'importance_pct' is rounded to 2 decimals, so features whose
# share rounds to 0.00% are excluded too — confirm this is intended.
important_features = rf_importance.loc[rf_importance['importance_pct'] > 0, 'feature']

In [0]:
important_features

In [0]:
# Rows of the random-forest importance table with a positive (rounded) percentage.
importance = rf_importance.loc[rf_importance['importance_pct'] > 0]

In [0]:
importance

In [0]:
# Order features from highest to lowest percentage and renumber the rows.
imp_df = (
    importance
    .sort_values("importance_pct", ascending=False)
    .reset_index(drop=True)
)


def make_trace(df, top_n, name):
    """Build a horizontal bar trace for the first ``top_n`` rows of ``df``.

    Args:
        df: Frame with 'feature' and 'importance_pct' columns.
        top_n: Number of leading rows to plot.
        name: Trace name passed to the bar.

    Returns:
        A ``go.Bar`` with percentages on x, feature names on y and each bar
        labelled with its percentage rounded to 2 decimals.
    """
    top_rows = df.head(top_n)
    pct = top_rows["importance_pct"]
    bar_labels = pct.round(2).astype(str) + "%"
    return go.Bar(
        name=name,
        orientation="h",
        x=pct,
        y=top_rows["feature"],
        text=bar_labels,
        textposition="outside",
    )

# Different views. Sizes are capped at the number of available features and
# de-duplicated, so a short table does not yield several identical "All" entries.
n_features = len(imp_df)
sizes = sorted({min(n, n_features) for n in (5, 10, 20, n_features)})
labels = [f"Top {n}" if n < n_features else "All" for n in sizes]

# View drawn on first render: Top 10, or every feature when fewer than 10 exist.
default_index = sizes.index(min(10, n_features))

traces = [make_trace(imp_df, n, labels[i]) for i, n in enumerate(sizes)]

# Start with the default view
fig = go.Figure(data=[traces[default_index]])


def _importance_button(n, label):
    """Return the dropdown button that switches the chart to the top ``n`` features."""
    top_rows = imp_df.head(n)
    pct = top_rows["importance_pct"]
    return dict(
        label=label,
        method="update",
        args=[
            # Trace update: one list entry per trace (this figure has one trace).
            # Plain lists keep the payload independent of pandas serialization.
            {"x": [pct.tolist()],
             "y": [top_rows["feature"].tolist()],
             "text": [(pct.round(2).astype(str) + "%").tolist()]},
            # Layout update
            {"title": f"Feature Importance (% of total) - {label}"}
        ],
    )


# Dropdown to switch number of features
updatemenus = [
    dict(
        type="dropdown",
        x=1.15,
        y=1.0,
        xanchor="left",
        buttons=[_importance_button(n, label) for n, label in zip(sizes, labels)],
        # Highlight the button matching the view actually drawn; Plotly would
        # otherwise mark the first button as active.
        active=default_index,
        showactive=True,
        direction="down"
    )
]

fig.update_layout(
    # Same title format the buttons apply, so it does not change shape on first click.
    title=f"Feature Importance (% of total) - {labels[default_index]}",
    xaxis_title="Importance (%)",
    yaxis_title="Feature",
    yaxis=dict(autorange="reversed"),
    margin=dict(l=200, r=20, t=60, b=40),
    updatemenus=updatemenus
)

fig.show()

In [0]:
comparison_export

In [0]:
import plotly.graph_objects as go
import pandas as pd

# Order the export table by random-forest rank and renumber the rows.
comparison_df = comparison_export.sort_values('rf_rank').reset_index(drop=True)

# Different views. Sizes are capped at the number of available features and
# de-duplicated, so a short table does not yield several identical "All" entries.
n_features = len(comparison_df)
sizes = sorted({min(n, n_features) for n in (5, 10, 20, n_features)})
labels = [f"Top {n}" if n < n_features else "All" for n in sizes]

# View drawn on first render: Top 10, or every feature when fewer than 10 exist.
default_index = sizes.index(min(10, n_features))


def _comparison_view(n):
    """Return the x / y / text payload for the top ``n`` rows.

    Each value is a two-element list: first the coefficient trace, then the
    RF-importance trace, matching the order the traces are added to the figure.
    """
    top_rows = comparison_df.head(n)
    features = top_rows['feature'].tolist()
    coefficients = top_rows['coefficient'].tolist()
    rf_values = top_rows['rf_importance'].tolist()
    return {
        'x': [coefficients, rf_values],
        'y': [features, features],
        'text': [
            [f"{x:+.3f}" for x in coefficients],
            [f"{x:.4f}" for x in rf_values]
        ]
    }


# Build initial figure with the default view
sub = comparison_df.head(sizes[default_index]).copy()

fig = go.Figure()

# Coefficient bars (simple blue)
fig.add_trace(go.Bar(
    x=sub['coefficient'],
    y=sub['feature'],
    orientation='h',
    name='Coefficient',
    marker=dict(color='#1f77b4'),
    text=[f"{x:+.3f}" for x in sub['coefficient']],
    textposition='outside'
))

# RF importance bars (simple orange)
fig.add_trace(go.Bar(
    x=sub['rf_importance'],
    y=sub['feature'],
    orientation='h',
    name='RF Importance',
    marker=dict(color='#ff7f0e'),
    text=[f"{x:.4f}" for x in sub['rf_importance']],
    textposition='outside'
))

# Dropdown
updatemenus = [
    dict(
        type="dropdown",
        x=1.15,
        y=1.0,
        xanchor="left",
        buttons=[
            dict(
                label=label,
                method="update",
                args=[
                    _comparison_view(n),
                    {'title': f"Feature Analysis - {label}"}
                ],
            )
            for n, label in zip(sizes, labels)
        ],
        # Highlight the button matching the view actually drawn; Plotly would
        # otherwise mark the first button as active.
        active=default_index,
        showactive=True,
        direction="down"
    )
]

# NOTE(review): coefficients and RF importances share one x-axis ('Value');
# their scales may differ widely — confirm this is the intended presentation.
fig.update_layout(
    title=f"Feature Analysis - {labels[default_index]}",
    xaxis_title='Value',
    yaxis_title='Feature',
    updatemenus=updatemenus,
    barmode='group',
    yaxis=dict(autorange='reversed'),
    margin=dict(l=200, r=100, t=80, b=60),
    height=700,
    hovermode='closest',
    legend=dict(x=1.15, y=0.95)
)

fig.show()
