In [0]:
# Import libraries
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

import numpy as np
import pandas as pd

try:
    import yaml
except ImportError:
    raise ImportError(
        "PyYAML is not installed. Please run the previous cell to install it, "
        "or run: %pip install pyyaml>=6.0"
    )

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from eda.feature_engineering import (
    create_time_features,
    create_task_speed_features,
    create_respondent_behavioral_features,
    add_wonky_features,
    create_fraud_risk_score,
    create_wonky_risk_score
)
from eda.statistical_tests import (
    compare_groups_statistically,
    compare_groups_with_both_tests,
    analyze_thresholds,
    perform_chi_square_tests,
    perform_mannwhitney_tests,
    perform_welch_ttests,
    perform_two_proportion_z_tests,
    compare_demographic_groups
)

from eda.visualizations import (
    create_histogram,
    create_box_plot,
    create_scatter_plot,
    create_bar_plot,
    create_temporal_breakdown_summary,
    create_task_speed_breakdown_summary,
    create_chi_squared_bar_chart,
    create_dual_axis_statistical_chart,
    create_feature_breakdown_table,
    create_distribution_comparison,
    calculate_temporal_feature_deltas,       
    create_chi_squared_delta_dual_axis_chart,
)

# Load configs
def _load_yaml_config(path):
    """Parse the YAML file at ``path`` and return its contents."""
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


# Paths are relative to the notebook's working directory.
feature_config = _load_yaml_config('../configs/feature_engineering.yaml')
stats_config = _load_yaml_config('../configs/statistical_tests.yaml')
paths_config = _load_yaml_config('../configs/data_paths.yaml')

# Show every column when a dataframe is displayed.
pd.set_option('display.max_columns', None)
print("✓ Imports and configs loaded successfully")


In [0]:
# Resolve the repository layout from the notebook's working directory:
# the repo root is its parent, and misc/ sits directly under the root.
notebook_path = os.getcwd()
repo_root = os.path.abspath(os.path.join(notebook_path, os.pardir))
misc_dir = os.path.join(repo_root, "misc")

In [0]:
# Look up the configured output file for the respondent-level features.
# Only its basename is used; the file is read from the repo's misc/ directory.
respondent_features_file = paths_config['output_files'].get('respondent_features')
if not respondent_features_file:
    # Fail with an explicit message instead of the opaque TypeError that
    # os.path.basename(None) would raise.
    raise KeyError(
        "'respondent_features' is missing or empty in the 'output_files' "
        "section of ../configs/data_paths.yaml"
    )

respondent_features_path = os.path.join(
    misc_dir, os.path.basename(respondent_features_file)
)

df = pd.read_parquet(respondent_features_path)

In [0]:
# Display the loaded respondent feature table as this cell's output.
df

In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def train_rf_and_plot_importance(df, target_col='wonky_study_count', top_n=20):
    """
    Train a Random Forest on a binary "wonky" label and rank feature importance.

    The target is binarised as ``df[target_col] > 0``. Identifier, date and
    target-related columns are removed from the feature matrix before fitting,
    remaining categorical columns are one-hot encoded and missing values are
    filled with 0. Despite the function's name, no plot is drawn: the ranking
    is returned as a dataframe and the out-of-bag score is printed.

    Args:
        df: Respondent-level dataframe; must contain ``target_col``.
        target_col: Column whose positive values mark a respondent as wonky
            (e.g., wonky_study_count or a binary flag).
        top_n: Number of top-ranked features to return.

    Returns:
        tuple: ``(rf, feature_importance_df)`` where ``rf`` is the fitted
        ``RandomForestClassifier`` and ``feature_importance_df`` holds the
        ``top_n`` rows of ``feature`` / ``importance`` sorted by descending
        importance.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    # 1. Prepare data
    # Binary target (wonky vs. not) rather than predicting the magnitude.
    y = (df[target_col] > 0).astype(int)

    # Columns that must never be used as features. Names absent from ``df``
    # are ignored, so the list can safely be a superset.
    exclude_cols = [
        # Identifiers
        'respondentPk', 'respondent_pk',
        # Raw dates
        'first_task_date', 'last_task_date',
        # The target itself and other wonky-derived columns
        target_col, 'wonky_task_ratio', 'wonky_task_instances',
        'wonky_study_flag', 'total_wonky_studies',
        # Flags treated as targets elsewhere in this notebook
        'is_high_wonky', 'is_quite_wonky',
    ]
    X = df.drop(columns=[c for c in exclude_cols if c in df.columns])

    # Any remaining datetime-like column cannot be converted to the numeric
    # matrix the forest needs, so drop it instead of failing inside fit().
    X = X.select_dtypes(exclude=['datetime', 'datetimetz', 'timedelta'])

    # One-hot encode string/categorical columns (with an indicator for
    # missing categories), then apply a simple zero imputation.
    X = pd.get_dummies(X, dummy_na=True)
    X = X.fillna(0)

    # 2. Train Random Forest
    # Balanced class weights compensate for the rarity of wonky respondents.
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,  # Let trees grow to find interactions
        min_samples_leaf=5,  # Prevent overfitting to single users
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
        oob_score=True
    )

    rf.fit(X, y)

    # 3. Extract feature importance (impurity-based / Gini importance)
    feat_imp_df = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    # 4. Print model quality metric
    print(f"Random Forest OOB Score (Accuracy): {rf.oob_score_:.4f}")

    # 5. Return the fitted model and the top-ranked features
    return rf, feat_imp_df.head(top_n)

# Example usage (requires the respondent-level dataframe `df` to be loaded):
# rf_model, top_features = train_rf_and_plot_importance(df)
# print(top_features)

In [0]:
# Summarise the dataframe's columns with DataFrame.info() to check what is
# available before building the feature matrix.
df.info()

In [0]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Prepare target: 1 when wonky_study_count is positive, otherwise 0
y = (df['wonky_study_count'] > 0).astype(int)

# Drop ALL non-feature columns (IDs, dates, targets and wonky-derived columns).
# The list matches the exclusions used by train_rf_and_plot_importance so that
# neither path leaks target information; names absent from df are skipped.
cols_to_drop = [
    # Identifiers
    'respondentPk',              # <--- THIS WAS THE CULPRIT
    'respondent_pk',
    # Raw dates
    'first_task_date',
    'last_task_date',
    # Target and wonky-derived columns
    'wonky_study_count',
    'wonky_task_ratio',
    'wonky_task_instances',
    'wonky_study_flag',
    'total_wonky_studies',
    'is_high_wonky',
    'is_quite_wonky',
]

X = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

# One-hot encode remaining categorical columns, then zero-fill missing values
X = pd.get_dummies(X, drop_first=True)
X = X.fillna(0)

# Train
rf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # compensate for the rare positive class
    random_state=42,
    n_jobs=-1,
    oob_score=True,
    max_depth=15  # Limit tree depth to prevent memorization
)
rf.fit(X, y)

# Show importance, highest first
imp = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print(imp.head(20))
print(f"OOB Accuracy: {rf.oob_score_:.4f}")


In [0]:
# Keep the fitted model and the ranked features instead of discarding the
# returned tuple, and show the ranking as this cell's output.
rf_model, top_features = train_rf_and_plot_importance(df)
top_features