In [0]:
!pip install shap

In [0]:
# Import libraries
import yaml
import sys
import os
# Add `<parent of the current working directory>/src` to the import search path.
# The location is derived from os.getcwd(), so it depends on where the kernel was started.
# Presumably this is what lets the `modelling` package imported below resolve — verify
# against the repository layout.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from modelling.modelling import (
    run_full_feature_importance_analysis,
    run_full_feature_importance_analysis_fixed,
)

# Load configs
# Both paths are relative to the kernel's working directory (one level up, then `configs/`).
# The files are opened explicitly as UTF-8 so that parsing does not depend on the platform's
# default text encoding.
with open('../configs/statistical_tests.yaml', 'r', encoding='utf-8') as f:
    stats_config = yaml.safe_load(f)

with open('../configs/data_paths.yaml', 'r', encoding='utf-8') as f:
    paths_config = yaml.safe_load(f)

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


In [0]:
# Load the parquet inputs from the repo-level `misc/` directory.
# The repo root is taken to be the parent of the current working directory.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, ".."))
misc_dir = os.path.join(repo_root, "misc")

# Only the file name of each configured output path is used; the directory part is
# replaced by `misc_dir`.
user_df_input_path, wonky_df_input_path, test_results_df_input_path = (
    os.path.join(misc_dir, os.path.basename(paths_config['output_files'][config_key]))
    for config_key in ('user_info_df_post_eda', 'wonky_respondent_df', 'test_results_df')
)

# Read order matches the path order above:
#   user_info_df         <- user_df_input_path
#   wonky_respondent_df  <- wonky_df_input_path
#   test_results_df      <- test_results_df_input_path
user_info_df, wonky_respondent_df, test_results_df = map(
    pd.read_parquet,
    (user_df_input_path, wonky_df_input_path, test_results_df_input_path),
)

In [0]:
# NOTE(review): the filter below, which would drop rows with a missing `exposure_band`, is
# commented out, so every row of `user_info_df` is passed on to later cells — confirm this
# is intended.
# user_info_df = user_info_df[~user_info_df['exposure_band'].isna()]
# Display (rows, columns) of the user-level frame.
user_info_df.shape

In [0]:
# Collect the `feature` values of rows whose `significant` flag is set, then drop every
# feature whose name contains 'is_weekend'.
significant_features = [
    feature_name
    for feature_name in test_results_df.loc[test_results_df['significant'], 'feature'].tolist()
    if 'is_weekend' not in feature_name
]
significant_features

In [0]:
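# # Run analysis (V1 call, currently disabled; the V2 cell below is the one that runs.
# # Note: `feature_columns` is not defined in the cells shown here.)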
# results = run_full_feature_importance_analysis(
#     df=user_info_df,
#     feature_cols=feature_columns,
#     outcome_var='wonky_study_count',
#     user_id_var='respondentPk'
# )

# # Access individual results
# print("\nLinear model R²:", results['linear_model'].rsquared)
# print("Random Forest CV R²:", results['cv_results']['test_r2'].mean())
# print("\nFeature consensus rankings:")
# print(results['comparison'][['feature', 'avg_rank']].head(10))

In [0]:
# V2
# Run the "fixed" feature-importance analysis on the significant features only, using
# `wonky_study_count` as the outcome and `respondentPk` as the user identifier.
results = run_full_feature_importance_analysis_fixed(
    df=user_info_df,
    feature_cols=significant_features,
    outcome_var='wonky_study_count',
    user_id_var='respondentPk',
)

# Summary banner
print("\n" + "="*80)
print("FINAL SUMMARY")
print("="*80)

# Headline fit metrics: R² of the linear model and the mean of the cross-validation
# `test_r2` values.
linear_r2 = results['linear_model'].rsquared
print(f"\nLinear Model R²: {linear_r2:.4f}")
rf_cv_r2 = np.mean(results['cv_results']['test_r2'])
print(f"Random Forest CV R²: {rf_cv_r2:.4f}")

# First five rows of the comparison table, printed without the index.
print("\nTop 5 Features (consensus):")
top_features = results['comparison'][['feature', 'avg_rank', 'p_value']].head(5)
print(top_features.to_string(index=False))
