In [1]:
# import required libraries
import pandas as pd

In [9]:
# Load the cleaned dataset and cast the `di_code` column to integers in one step.
data_df = pd.read_csv('250807_FinalCleanedData_PT.csv').astype({'di_code': int})

# Inspect what was loaded: print the column index, then display the first rows.
print(data_df.columns)
data_df.head()

Index(['Unnamed: 0.1', 'caseid', 'weight', 'wave', 'state', 'district',
       'urban', 'gender', 'age', 'caste',
       ...
       'A63_p2', 'A63_p90', 'A63_p95', 'A63_p98', 'A63_sum', 'A63_mean',
       'A63_min', 'A63_max', 'A63_count', 'split'],
      dtype='object', length=608)


Unnamed: 0,Unnamed: 0.1,caseid,weight,wave,state,district,urban,gender,age,caste,...,A63_p2,A63_p90,A63_p95,A63_p98,A63_sum,A63_mean,A63_min,A63_max,A63_count,split
0,0,10001,0.840547,2024,NE,East Khasi Hills,Urban,Male,18-29,Scheduled Castes/Tribes,...,-0.124567,0.103406,0.124567,0.147697,43582.604336,0.012406,-0.267958,0.327812,3519450,train
1,1,10002,1.360886,2024,UT,Leh,Rural,Female,45+,Scheduled Castes/Tribes,...,-0.206936,0.093564,0.14173,0.267958,-117536.992772,-0.024344,-0.318893,0.346021,4837249,train
2,2,10002,1.360886,2024,UT,Leh,Rural,Female,45+,Scheduled Castes/Tribes,...,-0.221453,0.103406,0.130165,0.16,-248714.921995,-0.038288,-0.327812,0.318893,6508506,train
3,3,10003,0.640417,2024,UT,Daman & Diu,Rural,Male,30-44,Other Castes,...,0.032541,0.221453,0.236463,0.244152,10633.937208,0.124858,-0.051734,0.355309,86148,train
4,4,10003,0.640417,2024,UT,Daman & Diu,Rural,Male,30-44,Other Castes,...,-0.022207,0.206936,0.244152,0.2599,4060.501217,0.087937,-0.113741,0.355309,47012,train


## Define column names for different models

In [10]:
# Feature-name lists for the four model variants, plus the target column names.

# Model 1: the five baseline columns (these are also the ones label-encoded later on).
features_list_model1 = ['age', 'gender', 'urban', 'caste', 'di_code']

# Model 2: baseline columns plus a fixed list of additional columns.
features_list_model2 = features_list_model1 + [
    'mean_monthly_avg_rd_MD', 'precip_sum_mm', 'precip_mean_mm', 'mean_tmax',
    'district_flood_sentiment', 'state_flood_sentiment',
]

# Model 3: baseline columns plus '<prefix>_<metric>' names, where <prefix> is the text
# before the first underscore of every column starting with 'A' (e.g. 'A63_p2' -> 'A63').
# The prefixes are sorted so the generated names come out in a stable order.
a_prefixes = sorted({
    column.split('_')[0]
    for column in data_df.columns
    if column.startswith('A')
})
summary_metrics = ['min', 'p2', 'mean', 'p98', 'max', 'sum', 'count']
features_list_model3 = features_list_model1 + [
    f'{prefix}_{metric}'
    for prefix in a_prefixes
    for metric in summary_metrics
]

# Model 4: union of the model 2 and model 3 features.
# dict.fromkeys() removes duplicates while keeping first-seen order. The previous
# list(set(...)) gave a different column order on every kernel start (string hashing is
# randomised), so the fitted models and saved feature list were not reproducible.
features_list_model4 = list(dict.fromkeys(features_list_model2 + features_list_model3))

# Target column names.
flood_y_key = 'n7fy23_recode'
drought_y_key = 'n7dy23_recode'

## Prepare data for the model

In [14]:
%%time

%%time
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables FIRST (on full dataset)
data_encoded = data_df.copy()
encoders = {}

for var in ['age', 'gender', 'urban', 'caste', 'di_code']:
    encoders[var] = LabelEncoder()
    data_encoded[var] = encoders[var].fit_transform(data_df[var])

# Split the encoded data (general variables)
X_train_encoded = data_encoded[data_encoded['split'] == 'train']
X_test_encoded = data_encoded[data_encoded['split'] == 'test']
y_train = data_encoded[data_encoded['split'] == 'train'][flood_y_key]
y_test = data_encoded[data_encoded['split'] == 'test'][flood_y_key]

print(f"X_train_encoded shape: {X_train_encoded.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test_encoded shape: {X_test_encoded.shape}")
print(f"y_test shape: {y_test.shape}")

X_train_encoded shape: (9376, 608)
y_train shape: (9376,)
X_test_encoded shape: (2047, 608)
y_test shape: (2047,)
CPU times: user 40.3 ms, sys: 39 ms, total: 79.2 ms
Wall time: 76.9 ms
CPU times: user 41.2 ms, sys: 39.1 ms, total: 80.3 ms
Wall time: 78 ms


In [12]:
import time
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

def train_models_with_features(X_train_encoded, X_test_encoded, y_train, y_test,
                               features_list, model_name_suffix=""):
    """
    Grid-search three classifiers on a feature subset, score them, and save the results.

    For each of RandomForest, GradientBoosting and LogisticRegression a 5-fold
    GridSearchCV (scored on accuracy) is fitted on the training data. The refitted
    search is then scored on both the training and the test rows. Feature importances
    are collected through get_feature_importance() and everything is written to disk
    by save_results().

    Parameters:
    -----------
    X_train_encoded : DataFrame
        Training features (encoded); only the `features_list` columns are used
    X_test_encoded : DataFrame
        Test features (encoded); only the `features_list` columns are used
    y_train : array-like
        Training target
    y_test : array-like
        Test target
    features_list : list
        Names of the columns to train on
    model_name_suffix : str
        Suffix for the saved files (e.g., "model1", "model2")

    Returns:
    --------
    tuple: (metrics_df, trained_models, feature_importance_results)
        metrics_df has one row per model; trained_models maps model name to the best
        estimator; feature_importance_results maps model name to its importance table.

    NOTE(review): column 1 of predict_proba and the default precision/recall/F1
    settings are used, so this presumably expects a binary target - confirm.
    """
    banner = f"{'='*60}"
    print(banner)
    print(f"Training models with {len(features_list)} features")
    if model_name_suffix:
        print(f"Model suffix: {model_name_suffix}")
    print(banner)

    # Restrict both frames to the requested columns
    train_X = X_train_encoded[features_list]
    test_X = X_test_encoded[features_list]

    # Estimator / hyper-parameter grid pairs, keyed by display name
    candidates = {
        'RandomForest': (
            RandomForestClassifier(random_state=42),
            {'n_estimators': [10, 50, 100, 200], 'max_depth': [10, 20, None]},
        ),
        'GradientBoosting': (
            GradientBoostingClassifier(random_state=42),
            {'n_estimators': [100, 200], 'learning_rate': [0.1, 0.2], 'max_depth': [3, 5]},
        ),
        'LogisticRegression': (
            LogisticRegression(random_state=42, max_iter=1000),
            {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2'], 'solver': ['liblinear']},
        ),
    }

    # Scores computed from hard predictions; AUC is handled separately because it
    # needs the predicted probabilities instead.
    label_metrics = [
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    ]

    results = []
    trained_models = {}
    feature_importance_results = {}

    for name, (estimator, param_grid) in candidates.items():
        print(f"Training {name}...")
        start = time.time()

        search = GridSearchCV(estimator, param_grid, cv=5, scoring='accuracy')
        search.fit(train_X, y_train)

        # Keep the refitted best estimator for later inspection and saving
        trained_models[name] = search.best_estimator_

        train_pred = search.predict(train_X)
        test_pred = search.predict(test_X)
        train_proba = search.predict_proba(train_X)[:, 1]
        test_proba = search.predict_proba(test_X)[:, 1]

        # Train/Test columns are interleaved per metric to keep the column order stable
        row = {'Model': name}
        for label, scorer in label_metrics:
            row[f'Train_{label}'] = scorer(y_train, train_pred)
            row[f'Test_{label}'] = scorer(y_test, test_pred)
        row['Train_AUC'] = roc_auc_score(y_train, train_proba)
        row['Test_AUC'] = roc_auc_score(y_test, test_proba)
        results.append(row)

        print(f"{name} - Test Acc: {row['Test_Accuracy']:.3f}, Test AUC: {row['Test_AUC']:.3f}")
        print(f"Time: {time.time()-start:.1f}s\n")

    metrics_df = pd.DataFrame(results)

    # Show the metrics with one column per model
    print("FINAL RESULTS:")
    print(metrics_df.round(3).set_index('Model').T)
    print()

    print("Extracting feature importance...")
    for model_name, fitted in trained_models.items():
        importance_df = get_feature_importance(fitted, model_name, features_list)
        if importance_df is None:
            continue
        feature_importance_results[model_name] = importance_df
        print(f"{model_name} - Top 5 features:")
        print(importance_df.head(5)[['Feature', 'Importance']].to_string(index=False))
        print()

    # Persist models, importances, metrics and the feature list
    save_results(metrics_df, trained_models, feature_importance_results,
                 features_list, model_name_suffix)

    return metrics_df, trained_models, feature_importance_results

def get_feature_importance(model, model_name, feature_names):
    """
    Build a per-feature importance table for a fitted model.

    The model's `feature_importances_` attribute is used when present; otherwise the
    absolute values of the first row of `coef_` are used. The result has the columns
    Feature, Importance, Model and Importance_Type, sorted by Importance in descending
    order. If the model has neither attribute, a message is printed and None is returned.
    """
    # (attribute to look for, how to read the scores, label stored in Importance_Type),
    # checked in this order.
    sources = (
        ('feature_importances_',
         lambda fitted: fitted.feature_importances_,
         'Gini/Entropy Importance'),
        ('coef_',
         lambda fitted: np.abs(fitted.coef_[0]),
         'Coefficient Magnitude'),
    )

    for attribute, read_scores, importance_type in sources:
        if not hasattr(model, attribute):
            continue
        ranking = pd.DataFrame({
            'Feature': feature_names,
            'Importance': read_scores(model),
            'Model': model_name,
            'Importance_Type': importance_type,
        })
        return ranking.sort_values('Importance', ascending=False)

    print(f"Feature importance not available for {model_name}")
    return None

def save_results(metrics_df, trained_models, feature_importance_results,
                 features_list, model_name_suffix):
    """
    Write models, importance tables, metrics and the feature list to 'saved_models/'.

    The directory is created if it does not exist. Every file name ends with
    '_<model_name_suffix>' when a non-empty suffix is given, and with nothing otherwise.
    Models and the feature list are written with joblib; importance tables and the
    metrics table are written as CSV without the index.
    """
    save_dir = 'saved_models'
    os.makedirs(save_dir, exist_ok=True)

    # File-name suffix shared by every artefact of this run
    if model_name_suffix:
        suffix = f"_{model_name_suffix}"
    else:
        suffix = ""

    # One pickle per fitted estimator
    for model_name in trained_models:
        model_path = f'{save_dir}/{model_name}_model{suffix}.pkl'
        joblib.dump(trained_models[model_name], model_path)

    # One CSV per importance table
    for model_name in feature_importance_results:
        importance_path = f'{save_dir}/{model_name}_feature_importance{suffix}.csv'
        feature_importance_results[model_name].to_csv(importance_path, index=False)

    # Metrics table and the list of feature names used for this run
    metrics_path = f'{save_dir}/model_metrics{suffix}.csv'
    metrics_df.to_csv(metrics_path, index=False)
    features_path = f'{save_dir}/feature_list{suffix}.pkl'
    joblib.dump(features_list, features_path)

    print(f"Results saved with suffix '{suffix}' in '{save_dir}/' directory")




In [17]:
%%time

# Train with first set of features
metrics1, models1, importance1 = train_models_with_features(
    X_train_encoded[features_list_model1], X_test_encoded[features_list_model1], y_train, y_test, 
    features_list_model1, "model1"
)

# Train with second set of features  
metrics2, models2, importance2 = train_models_with_features(
    X_train_encoded[features_list_model2], X_test_encoded[features_list_model2], y_train, y_test,
    features_list_model2, "model2"
)

# Train with third set of features
metrics3, models3, importance3 = train_models_with_features(
    X_train_encoded[features_list_model3], X_test_encoded[features_list_model3], y_train, y_test,
    features_list_model3, "model3"
)

# Train with fourth set of features
metrics4, models4, importance4 = train_models_with_features(
    X_train_encoded[features_list_model4], X_test_encoded[features_list_model4], y_train, y_test,
    features_list_model4, "model4"
)

Training models with 5 features
Model suffix: model1
Training RandomForest...
RandomForest - Test Acc: 0.558, Test AUC: 0.572
Time: 26.2s

Training GradientBoosting...
GradientBoosting - Test Acc: 0.555, Test AUC: 0.585
Time: 24.2s

Training LogisticRegression...
LogisticRegression - Test Acc: 0.581, Test AUC: 0.612
Time: 0.3s

FINAL RESULTS:
Model            RandomForest  GradientBoosting  LogisticRegression
Train_Accuracy          0.711             0.627               0.561
Test_Accuracy           0.558             0.555               0.581
Train_Precision         0.692             0.609               0.546
Test_Precision          0.551             0.559               0.593
Train_Recall            0.704             0.602               0.437
Test_Recall             0.557             0.468               0.479
Train_F1                0.698             0.605               0.485
Test_F1                 0.554             0.509               0.530
Train_AUC               0.793             0