# In [None]:
# ===== IMPORTS & CONFIGURATION =====
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# Seed the global NumPy and TensorFlow random number generators so repeated
# runs draw the same values. Only these two global generators are seeded here;
# the estimators created below pass their own random_state.
np.random.seed(42)
tf.random.set_seed(42)

# ===== DATA INGESTION =====
def load_data(file_path, n_samples=327741, n_features=139, fraud_ratio=0.108):
    """Build a synthetic stand-in for the fraud dataset.

    No file is read. ``file_path`` is accepted so the call site has the shape
    of a real loader, but it is ignored.

    Parameters
    ----------
    file_path : str
        Unused; kept for interface compatibility.
    n_samples : int, default 327741
        Number of rows to generate.
    n_features : int, default 139
        Number of anonymous ``feature_<i>`` columns drawn from a standard
        normal distribution.
    fraud_ratio : float, default 0.108
        Fraction of rows labelled ``FRAUD == 1``; the count is truncated with
        ``int(n_samples * fraud_ratio)``.

    Returns
    -------
    pandas.DataFrame
        The ``feature_<i>`` columns, then one uniform-random column per entry
        of the known-column list, then the shuffled binary ``FRAUD`` target.

    Raises
    ------
    ValueError
        If a size is negative or ``fraud_ratio`` is outside ``[0, 1]``.
    """
    if n_samples < 0 or n_features < 0:
        raise ValueError("n_samples and n_features must be non-negative")
    if not 0.0 <= fraud_ratio <= 1.0:
        raise ValueError("fraud_ratio must be between 0 and 1")

    # Create synthetic features based on PDF column names
    data = pd.DataFrame(
        np.random.randn(n_samples, n_features),
        columns=[f'feature_{i}' for i in range(n_features)],
    )

    # Add known columns from PDF
    known_columns = [
        'ACT_AGE', 'LIMIT', 'OUTS', 'ACT_INESTRUM', 'TERURE', 'LOIN',
        'INSTALANT', 'ST_FLG', 'AGE', 'VENTAGE', 'KYC_SCR', 'CREDIT_HISTORY_LENGTH',
        'NO_OF_INQUIRIES', 'TACQUE_BAND', 'AGREG_GROUP', 'SI_FLG', 'LOCKER_HLDR_IND',
        'UID_FLG', 'KYC_FLG', 'INB_FLG', 'EKYCFLG', 'LATEST_CR_DAYS', 'ALL_LON_MAX_IRAC',
        'LOAN_TENURE', 'LAST_1_YR_R64', 'LAST_3_YR_R64', 'CHST_NO_OF_TIMES_NPA',
        'FIRST_NPA_TENURE', 'LATEST_NPA_TENURE', 'NO_YRS_NPA', 'NO_ENG', 'CRIFT_33',
        'CRIFT_44', 'CRIFT_22', 'SIXAMTHAYGOTTO', 'SIXAMTHISCR', 'SIXAMTHISDR',
        'SIXAMTHOUTSTANGBAL', 'SIXAMTHAYGMTD', 'FIVEMATHISCR', 'FIVEMATHAYGMTD',
        'FIVEMATHOUTSTANGBAL', 'FIVEMATHISDR'
    ]

    # Draw every missing column first and attach them in one concat, rather
    # than inserting them into the frame one at a time.
    extra = {
        col: np.random.rand(n_samples)
        for col in known_columns
        if col not in data.columns
    }
    if extra:
        data = pd.concat([data, pd.DataFrame(extra, index=data.index)], axis=1)

    # Build and shuffle the target as a plain array *before* it enters the
    # frame. Shuffling ``data['FRAUD'].values`` in place depends on ``.values``
    # being a writable view of the column, which pandas does not guarantee
    # (it is read-only under Copy-on-Write).
    fraud_cases = int(n_samples * fraud_ratio)
    target = np.zeros(n_samples, dtype=np.int64)
    target[:fraud_cases] = 1
    data['FRAUD'] = np.random.permutation(target)

    return data

# ===== DATA CLEANING =====
def clean_data(data):
    """Return a copy of ``data`` without sparse, known-bad or redundant columns.

    A column is removed when more than half of its values are null, when it is
    one of the fixed names listed below, or when it is ``AGREG_GROUP`` /
    ``Unique ID``. Names that are not present are silently skipped, and the
    input frame itself is left untouched.
    """
    # Columns the PDF specification says to discard outright.
    spec_drops = (
        'LAST_1_YR_R64', 'LAST_3_YR_R64', 'CHST_NO_OF_TIMES_NPA',
        'FIRST_NPA_TENURE', 'LATEST_NPA_TENURE', 'NO_YRS_NPA', 'NO_ENG',
    )
    # Columns the PDF specification treats as redundant.
    redundant = ('AGREG_GROUP', 'Unique ID')

    # Strictly more than 50% nulls qualifies; exactly half is kept.
    half_rows = len(data) * 0.5
    null_counts = data.isnull().sum()
    sparse = [name for name, count in null_counts.items() if count > half_rows]

    unwanted = set(sparse) | set(spec_drops) | set(redundant)
    return data.drop(columns=list(unwanted), errors='ignore')

# ===== FEATURE ENGINEERING =====
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived columns to a DataFrame.

    ``transform`` works on a copy, so the caller's frame is never modified.
    Each derived feature is only produced when the columns it needs are
    present; ``TOTAL_FLAGS`` is always produced and sums whichever of
    ``flag_columns`` exist.
    """

    # Stems of the monthly series (``<stem>_MONTH_1`` .. ``<stem>_MONTH_12``).
    _TS_FEATURES = ('SCR', 'SDR', 'OUTSTANGBAL', 'AVGMTD', 'AVGQTD', 'AVGYTD')
    _N_MONTHS = 12

    def __init__(self):
        self.flag_columns = ['SI_FLG', 'LOCKER_HLDR_IND', 'UID_FLG', 'KYC_FLG', 'INB_FLG', 'EKYC_FLG']
        self.balance_columns = [f'BALANCE_MONTH_{i}' for i in range(1, 13)]

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is no state to learn."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw feature frame.

        Returns
        -------
        pandas.DataFrame
            A new frame holding the original columns plus the derived ones.
        """
        # Work on a copy: a transformer must not mutate its input.
        X = X.copy()

        # Sum only the flag columns that exist, so one absent or differently
        # spelled flag does not raise KeyError. With none present the sum of
        # zero columns is 0 for every row.
        present_flags = [col for col in self.flag_columns if col in X.columns]
        X['TOTAL_FLAGS'] = X[present_flags].sum(axis=1)

        # Split-averaged balances: first five months vs. the remaining seven.
        if all(col in X.columns for col in self.balance_columns):
            X['avg_balance_early'] = X[self.balance_columns[:5]].mean(axis=1)
            X['avg_balance_late'] = X[self.balance_columns[5:]].mean(axis=1)

        # Indicator features. The comparisons are vectorised; a NaN compares
        # False for < / > / <= and True for !=, the same result the scalar
        # comparisons give.
        if 'LATEST_CR_DAYS' in X.columns:
            days = X['LATEST_CR_DAYS']
            X['high_latest_cr_days'] = ((days > 200) & (days < 500)).astype('int64')

        if 'ALL_LON_MAX_IRAC' in X.columns:
            X['non_irac_3_flag'] = X['ALL_LON_MAX_IRAC'].ne(3).astype('int64')

        if 'LOAN_TENURE' in X.columns:
            X['short_loan_tenure_flag'] = X['LOAN_TENURE'].le(1096).astype('int64')

        # Time-series feature aggregation
        for feature in self._TS_FEATURES:
            cols = [f'{feature}_MONTH_{i}' for i in range(1, self._N_MONTHS + 1)]
            if all(col in X.columns for col in cols):
                self._add_time_series_stats(X, feature, cols)

        # Convert account age and credit history to months
        if 'ACCOUNT_AGE' in X.columns:
            X['ACCOUNT_AGE_MONTHS'] = X['ACCOUNT_AGE'] * 12

        if 'CREDIT_HISTORY_LENGTH' in X.columns:
            X['CREDIT_HISTORY_MONTHS'] = self._credit_history_months(
                X['CREDIT_HISTORY_LENGTH'])

        return X

    @staticmethod
    def _add_time_series_stats(X, feature, cols):
        """Add mean/min/max/std/first/last/diff/slope of ``cols`` to ``X`` in place."""
        ts_data = X[cols]
        X[f'{feature}_MEAN'] = ts_data.mean(axis=1)
        X[f'{feature}_MIN'] = ts_data.min(axis=1)
        X[f'{feature}_MAX'] = ts_data.max(axis=1)
        X[f'{feature}_STD'] = ts_data.std(axis=1)
        X[f'{feature}_FIRST'] = ts_data.iloc[:, 0]
        X[f'{feature}_LAST'] = ts_data.iloc[:, -1]
        X[f'{feature}_DIFF'] = X[f'{feature}_LAST'] - X[f'{feature}_FIRST']

        # Least-squares slope against month positions 0..n-1, for all rows at
        # once: slope = sum((t - mean(t)) * y) / sum((t - mean(t))**2).
        # This is the degree-1 coefficient a per-row polyfit would return,
        # without a Python-level loop over rows. A row containing NaN yields
        # a NaN slope.
        steps = np.arange(len(cols), dtype=float)
        centered = steps - steps.mean()
        values = ts_data.to_numpy(dtype=float)
        X[f'{feature}_SLOPE'] = values @ centered / (centered @ centered)

    @staticmethod
    def _credit_history_months(values):
        """Convert ``"<X>yrs & <Y>mon"`` strings to a month count (X * 12 + Y).

        Entries that do not match the pattern (NaN, numbers, other text)
        become NaN instead of raising. The result is int64 when every entry
        parsed and float otherwise.
        """
        parts = values.astype(str).str.extract(r'^\s*(\d+)\s*yrs\s*&\s*(\d+)\s*mon')
        years = pd.to_numeric(parts[0], errors='coerce')
        months = pd.to_numeric(parts[1], errors='coerce')
        total = years * 12 + months
        if total.notna().all():
            total = total.astype('int64')
        return total

# ===== STAGE 1: HIGH-RECALL FILTERING =====
def _select_rows(data, positions):
    """Take rows by integer position from a pandas object or an array-like."""
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return data[positions]


def stage1_filtering(X, y, threshold=0.3):
    """Train a class-weighted XGBoost model and keep the rows it flags.

    The data is split 80/20 (stratified, ``random_state=42``). The model is
    fitted on the training part, its metrics at ``threshold`` are printed for
    the held-out part, and the training rows whose predicted fraud
    probability exceeds ``threshold`` are returned for Stage 2.

    Parameters
    ----------
    X : pandas.DataFrame, numpy.ndarray or scipy sparse matrix
        Feature matrix.
    y : pandas.Series or array-like
        Binary target (0 = non-fraud, 1 = fraud).
    threshold : float, default 0.3
        Probability above which a row counts as predicted fraud.

    Returns
    -------
    tuple
        ``(X_filtered, y_filtered, model)``: the selected training rows, their
        labels, and the fitted ``XGBClassifier``.

    Raises
    ------
    ValueError
        If the training split contains no positive samples, because the class
        weight cannot be computed.
    """
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42)

    # Weight positives by the negative/positive ratio of the training split.
    y_train_values = np.asarray(y_train)
    n_negative = int(np.sum(y_train_values == 0))
    n_positive = int(np.sum(y_train_values == 1))
    if n_positive == 0:
        raise ValueError("stage1_filtering needs at least one positive sample in y")

    # Train XGBoost model
    xgb = XGBClassifier(scale_pos_weight=n_negative / n_positive,
                        eval_metric='logloss',
                        random_state=42)
    xgb.fit(X_train, y_train)

    # Get probabilities and apply threshold
    y_proba = xgb.predict_proba(X_test)[:, 1]
    y_pred = (y_proba > threshold).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even if one class never appears in
    # y_test / y_pred, so the four-way unpack always works.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    fraud_miss_rate = fn / (tp + fn) if (tp + fn) > 0 else 0.0
    non_fraud_accuracy = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    # shape[0] rather than len(): len() is not defined for sparse matrices.
    n_test = X_test.shape[0]

    print(f"Stage 1 Performance (Threshold={threshold}):")
    print(f"Recall: {recall:.4f}, Fraud Miss Rate: {fraud_miss_rate:.4f}")
    print(f"Non-Fraud Accuracy: {non_fraud_accuracy:.4f}")
    print(f"Filtered Subset Size: {tp + fp} ({((tp+fp)/n_test)*100:.2f}% of test set)")

    # Filter training data for Stage 2. Positional selection goes through
    # _select_rows so DataFrames/Series and plain arrays are both accepted.
    train_proba = xgb.predict_proba(X_train)[:, 1]
    train_filtered_idx = np.flatnonzero(train_proba > threshold)

    return (_select_rows(X_train, train_filtered_idx),
            _select_rows(y_train, train_filtered_idx),
            xgb)

# ===== STAGE 2: NEURAL NETWORK =====
def build_nn_model(input_dim):
    """Build and compile the Stage 2 binary classifier.

    The network is four ReLU dense layers (512, 256, 128, 64 units), each
    followed by 30% dropout, and a single sigmoid output unit. It is compiled
    with Adam (learning rate 0.001), binary cross-entropy loss, and accuracy,
    precision and recall metrics.

    ``input_dim`` is the number of input features per sample.
    """
    hidden_units = (512, 256, 128, 64)
    dropout_rate = 0.3

    stack = []
    for position, units in enumerate(hidden_units):
        if position == 0:
            # Only the first layer declares the input shape.
            stack.append(Dense(units, activation='relu', input_shape=(input_dim,)))
        else:
            stack.append(Dense(units, activation='relu'))
        stack.append(Dropout(dropout_rate))
    stack.append(Dense(1, activation='sigmoid'))

    network = Sequential(stack)

    optimizer = Adam(learning_rate=0.001)
    tracked = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    network.compile(optimizer=optimizer,
                    loss='binary_crossentropy',
                    metrics=tracked)

    return network

def evaluate_thresholds(y_true, y_proba, thresholds=(0.5, 0.6, 0.7, 0.8)):
    """Tabulate TP, FP, precision and recall at several probability cut-offs.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; entries equal to 1 are positives, everything else
        is treated as negative.
    y_proba : array-like
        Predicted positive-class probabilities. Any shape with one value per
        sample is accepted (for example the ``(n, 1)`` output of a Keras
        model); it is flattened.
    thresholds : iterable of float, default (0.5, 0.6, 0.7, 0.8)
        A sample is predicted positive when its probability is strictly
        greater than the cut-off.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``threshold``, ``TP``, ``FP``,
        ``precision`` and ``recall``. A ratio with a zero denominator is
        reported as 0.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_proba`` hold a different number of samples.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_proba).ravel()
    if labels.shape[0] != scores.shape[0]:
        raise ValueError(
            f"y_true has {labels.shape[0]} samples but y_proba has {scores.shape[0]}")

    is_positive = labels == 1

    # The counts are taken directly from boolean masks. Unpacking a confusion
    # matrix into four cells fails when only one class is present (the matrix
    # collapses to 1x1), which can happen at strict cut-offs.
    results = []
    for threshold in thresholds:
        predicted = scores > threshold
        tp = int(np.sum(predicted & is_positive))
        fp = int(np.sum(predicted & ~is_positive))
        fn = int(np.sum(~predicted & is_positive))

        results.append({
            'threshold': threshold,
            'TP': tp,
            'FP': fp,
            'precision': tp / (tp + fp) if (tp + fp) > 0 else 0,
            'recall': tp / (tp + fn) if (tp + fn) > 0 else 0,
        })

    return pd.DataFrame(
        results, columns=['threshold', 'TP', 'FP', 'precision', 'recall'])

# ===== ENSEMBLE WITH UNCERTAINTY HANDLING =====
def ensemble_voting(X_train, y_train, X_test,
                    fraud_cutoff=0.8, non_fraud_cutoff=0.2, min_votes=3):
    """Train seven classifiers and label test rows fraud / non-fraud / uncertain.

    Each model produces a fraud probability for every test row. A row is
    labelled:

    * ``1`` (fraud) when at least ``min_votes`` models score it at or above
      ``fraud_cutoff``;
    * ``0`` (non-fraud) when that is not the case and at least ``min_votes``
      models score it at or below ``non_fraud_cutoff``;
    * ``2`` (uncertain) otherwise.

    With seven models and ``min_votes=3`` these are the same order statistics
    the rule was written around: the 5th smallest score for fraud and the 3rd
    smallest score for non-fraud.

    NOTE(review): the earlier rule compared those order statistics with the
    80th/20th percentile *of the same seven scores*. Under linear
    interpolation that percentile lies between two neighbouring sorted
    scores, so the test could only pass when those two scores were exactly
    equal, and nearly every row came out "uncertain". The "top 20%" /
    "bottom 20%" wording is interpreted here as absolute probability
    cut-offs of 0.8 and 0.2 -- confirm against the specification.

    Parameters
    ----------
    X_train, y_train
        Training features and binary labels.
    X_test
        Features to label; must expose ``shape``.
    fraud_cutoff : float, default 0.8
        Minimum probability for a model to count as a fraud vote.
    non_fraud_cutoff : float, default 0.2
        Maximum probability for a model to count as a non-fraud vote.
    min_votes : int, default 3
        Number of agreeing models needed for a confident label.

    Returns
    -------
    tuple
        ``(final_preds, trained_models)``: a float array of 0/1/2 labels, one
        per test row, and a dict mapping model name to fitted estimator.
    """
    models = [
        ('xgb', XGBClassifier(random_state=42)),
        ('mlp', MLPClassifier(hidden_layer_sizes=(100,), random_state=42)),
        ('lgbm', LGBMClassifier(random_state=42)),
        ('catboost', CatBoostClassifier(verbose=0, random_state=42)),
        ('logreg', LogisticRegression(max_iter=1000, random_state=42)),
        ('extratrees', ExtraTreesClassifier(random_state=42)),
        ('randomforest', RandomForestClassifier(random_state=42))
    ]

    # Train individual models
    trained_models = {}
    for name, model in models:
        model.fit(X_train, y_train)
        trained_models[name] = model

    # One row per model, one column per test sample.
    proba = np.vstack([
        model.predict_proba(X_test)[:, 1] for model in trained_models.values()
    ])

    fraud_votes = (proba >= fraud_cutoff).sum(axis=0)
    non_fraud_votes = (proba <= non_fraud_cutoff).sum(axis=0)

    # Start from "uncertain"; fraud is assigned last so it takes precedence
    # when both vote counts reach the minimum.
    final_preds = np.full(X_test.shape[0], 2.0)
    final_preds[non_fraud_votes >= min_votes] = 0
    final_preds[fraud_votes >= min_votes] = 1

    return final_preds, trained_models

# ===== MAIN EXECUTION PIPELINE =====
def main():
    """Run the whole pipeline end to end.

    Steps: generate data, clean it, engineer features, impute/encode, run the
    Stage 1 XGBoost filter, train and evaluate the Stage 2 neural network on
    the filtered rows, then train the ensemble and report its confident
    predictions.

    Writes ``stage1_xgb_model.pkl``, ``stage2_nn_model.h5`` and
    ``ensemble_models.pkl`` to the current working directory.
    """
    # Step 1: Data Ingestion
    print("Loading data...")
    data = load_data("PSB_Hackathon_data.csv")

    # Step 2: Data Cleaning
    print("Cleaning data...")
    data_cleaned = clean_data(data)

    # Separate features and target
    X = data_cleaned.drop(columns=['FRAUD'])
    y = data_cleaned['FRAUD']

    # Step 3: Feature Engineering
    print("Engineering features...")
    feature_engineer = FeatureEngineer()
    X_engineered = feature_engineer.fit_transform(X)

    # Step 4: Preprocessing
    # Identify categorical and numerical columns
    cat_cols = X_engineered.select_dtypes(include=['object', 'category']).columns
    num_cols = X_engineered.select_dtypes(include=['number']).columns

    # sparse_threshold=0 forces a dense result: the later np.hstack and the
    # Keras model both need dense input.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', SimpleImputer(strategy='median'), num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
        ],
        sparse_threshold=0)

    # Keep the result as a DataFrame sharing y's index so the positional
    # (.iloc) row selection in Stage 1 works and rows stay aligned with y.
    X_preprocessed = pd.DataFrame(
        preprocessor.fit_transform(X_engineered), index=X_engineered.index)

    # Step 5: Stage 1 - High-Recall Filtering
    print("Stage 1: High-recall filtering...")
    X_filtered, y_filtered, stage1_model = stage1_filtering(
        X_preprocessed, y, threshold=0.3)

    # Save Stage 1 model
    joblib.dump(stage1_model, 'stage1_xgb_model.pkl')

    # Step 6: Stage 2 - Neural Network
    print("\nStage 2: Neural Network Training...")
    # This one split is used for both the neural network and the ensemble;
    # repeating it with the same inputs and random_state yields the same
    # partition.
    X_train, X_test, y_train, y_test = train_test_split(
        X_filtered, y_filtered, test_size=0.3, stratify=y_filtered, random_state=42)
    y_train_values = np.asarray(y_train)
    y_test_values = np.asarray(y_test)

    # Add XGBoost probabilities as an extra feature. The augmented matrices
    # get their own names so the plain X_train / X_test stay available.
    train_xgb_proba = stage1_model.predict_proba(X_train)[:, 1]
    X_train_nn = np.hstack([np.asarray(X_train), train_xgb_proba.reshape(-1, 1)])

    # Build and train NN
    nn_model = build_nn_model(X_train_nn.shape[1])
    nn_model.fit(X_train_nn, y_train_values,
                 epochs=50,
                 batch_size=256,
                 validation_split=0.2,
                 verbose=1)

    # Evaluate on test data
    test_xgb_proba = stage1_model.predict_proba(X_test)[:, 1]
    X_test_nn = np.hstack([np.asarray(X_test), test_xgb_proba.reshape(-1, 1)])
    # Keras returns shape (n, 1); flatten to one probability per sample.
    y_proba_nn = nn_model.predict(X_test_nn).ravel()

    # Evaluate at different thresholds
    threshold_results = evaluate_thresholds(y_test_values, y_proba_nn)
    print("\nNeural Network Threshold Performance:")
    print(threshold_results)

    # Save NN model
    nn_model.save('stage2_nn_model.h5')

    # Step 7: Ensemble with Uncertainty Handling
    print("\nTraining Ensemble with Uncertainty Handling...")
    # Use filtered data without XGB proba feature
    ensemble_preds, ensemble_models = ensemble_voting(X_train, y_train, X_test)

    # Analyze results (label 2 marks an uncertain row)
    confident_mask = ensemble_preds != 2
    confident_preds = ensemble_preds[confident_mask]
    confident_true = y_test_values[confident_mask]
    uncertain_count = int(np.sum(~confident_mask))

    print("\nEnsemble Results:")
    print(f"Confident predictions: {len(confident_preds)}")
    print(f"Uncertain cases: {uncertain_count} ({uncertain_count/len(ensemble_preds)*100:.2f}%)")
    print("\nConfident Prediction Performance:")
    if len(confident_preds) > 0:
        print(classification_report(confident_true, confident_preds))
    else:
        # A report over zero samples is meaningless; say so instead.
        print("No confident predictions to evaluate.")

    # Save ensemble models
    joblib.dump(ensemble_models, 'ensemble_models.pkl')

    print("\nPipeline execution complete!")


if __name__ == "__main__":
    main()