In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [2]:
import pandas as pd
import numpy as np
import os

# --- Configuration ---
print("--- [Notebook Pipeline - Model 7] Part 1: Loading & Initial Setup ---")
DATA_FILE_PATH = '/kaggle/input/shodhh/accepted_2007_to_2018Q4.csv'

# --- 1. Load Data ---
try:
    df_nb = pd.read_csv(DATA_FILE_PATH, low_memory=False)
    # Drop rows where loan_status is missing (as done in the notebook)
    df_nb = df_nb.dropna(subset=["loan_status"])
    print(f"✅ Full data loaded. Shape: {df_nb.shape}")
except Exception as e:
    print(f"❌ Error loading data: {e}")
    df_nb = pd.DataFrame()

if not df_nb.empty:
    # --- 2. Sample Data (as done in the notebook) ---
    print("\nSampling 100,000 rows...")
    sampled_df_nb = df_nb.sample(n=100000, random_state=42)
    print(f"Sampled data shape: {sampled_df_nb.shape}")

    # --- 3. Define Target Variable (Notebook's Definition) ---
    print("Defining target variable 'loan_condition_int' (notebook definition)...")
    bad_loan_statuses_nb = [
        "Charged Off", "Default", "Does not meet the credit policy. Status:Charged Off",
        "In Grace Period", "Late (16-30 days)", "Late (31-120 days)"
    ]
    sampled_df_nb['loan_condition_int'] = sampled_df_nb['loan_status'].apply(
        lambda status: 1 if status in bad_loan_statuses_nb else 0
    ).astype(int)
    # Also create the string version for reference if needed later
    sampled_df_nb['loan_condition'] = np.where(sampled_df_nb['loan_condition_int'] == 0, 'Good Loan', 'Bad Loan')
    print("Target variable defined.")
    print("Target distribution in sample:")
    print(sampled_df_nb['loan_condition_int'].value_counts(normalize=True))


    # --- 4. Map emp_length (Notebook's Mapping) ---
    print("\nMapping 'emp_length' to 'emp_length_int'...")
    emp_length_mapping_nb = {
        '10+ years': 10, '9 years': 9, '8 years': 8, '7 years': 7, '6 years': 6,
        '5 years': 5, '4 years': 4, '3 years': 3, '2 years': 2, '1 year': 1,
        '< 1 year': 0.5, 'n/a': 0
    }
    sampled_df_nb['emp_length_int'] = sampled_df_nb['emp_length'].map(emp_length_mapping_nb)

    # --- 5. Map Region (Notebook's Mapping) ---
    print("Mapping 'addr_state' to 'region'...")
    state_to_region_nb = {
        'CA': 'West', 'OR': 'West', 'UT': 'West', 'WA': 'West', 'CO': 'West', 'NV': 'West',
        'AK': 'West', 'MT': 'West', 'HI': 'West', 'WY': 'West', 'ID': 'West', 'AZ': 'SouthWest',
        'TX': 'SouthWest', 'NM': 'SouthWest', 'OK': 'SouthWest', 'GA': 'SouthEast', 'NC': 'SouthEast',
        'VA': 'SouthEast', 'FL': 'SouthEast', 'KY': 'SouthEast', 'SC': 'SouthEast', 'LA': 'SouthEast',
        'AL': 'SouthEast', 'WV': 'SouthEast', 'DC': 'SouthEast', 'AR': 'SouthEast', 'DE': 'SouthEast',
        'MS': 'SouthEast', 'TN': 'SouthEast', 'IL': 'MidWest', 'MO': 'MidWest', 'MN': 'MidWest',
        'OH': 'MidWest', 'WI': 'MidWest', 'KS': 'MidWest', 'MI': 'MidWest', 'SD': 'MidWest',
        'IA': 'MidWest', 'NE': 'MidWest', 'IN': 'MidWest', 'ND': 'MidWest', 'CT': 'NorthEast',
        'NY': 'NorthEast', 'PA': 'NorthEast', 'NJ': 'NorthEast', 'RI': 'NorthEast', 'MA': 'NorthEast',
        'MD': 'NorthEast', 'VT': 'NorthEast', 'NH': 'NorthEast', 'ME': 'NorthEast'
    }
    sampled_df_nb['region'] = sampled_df_nb['addr_state'].map(state_to_region_nb)

    # Store for next step
    model_7_step1_df = sampled_df_nb
    print("\n✅ Initial setup complete.")

else:
    print("❌ Cannot proceed, data loading failed.")

--- [Notebook Pipeline - Model 7] Part 1: Loading & Initial Setup ---
✅ Full data loaded. Shape: (2260668, 151)

Sampling 100,000 rows...
Sampled data shape: (100000, 151)
Defining target variable 'loan_condition_int' (notebook definition)...
Target variable defined.
Target distribution in sample:
loan_condition_int
0    0.86628
1    0.13372
Name: proportion, dtype: float64

Mapping 'emp_length' to 'emp_length_int'...
Mapping 'addr_state' to 'region'...

✅ Initial setup complete.


In [3]:
import pandas as pd
import numpy as np

# Assuming 'model_7_step1_df' is the sampled DataFrame from the previous step

print("--- [Notebook Pipeline - Model 7] Part 2: Data Cleaning (Exclusions) ---")

if 'model_7_step1_df' in locals() or 'model_7_step1_df' in globals():
    df_cleaning_nb = model_7_step1_df.copy()
    original_shape = df_cleaning_nb.shape
    print(f"Shape before cleaning: {original_shape}")

    # --- 1. Remove 'Current' and 'Issued' loan_status ---
    print("\nRemoving 'Current' and 'Issued' loan statuses...")
    initial_rows = len(df_cleaning_nb)
    df_cleaning_nb = df_cleaning_nb[~df_cleaning_nb['loan_status'].isin(['Current', 'Issued'])]
    rows_removed = initial_rows - len(df_cleaning_nb)
    print(f"Removed {rows_removed} rows. New shape: {df_cleaning_nb.shape}")

    # --- 2. Drop columns with > 80% missing values ---
    print("\nDropping columns with > 80% missing values...")
    initial_cols = df_cleaning_nb.shape[1]
    # Keep columns with at least 20% non-missing data
    df_cleaning_nb = df_cleaning_nb.dropna(axis=1, thresh=int(0.20 * len(df_cleaning_nb)))
    cols_dropped = initial_cols - df_cleaning_nb.shape[1]
    print(f"Dropped {cols_dropped} columns. New shape: {df_cleaning_nb.shape}")

    # --- 3. Drop direct indicator columns (as defined in notebook) ---
    print("\nDropping direct indicator columns...")
    direct_indicators_nb = [
        'collection_recovery_fee', 'last_pymnt_amnt', 'out_prncp', 'out_prncp_inv',
        'recoveries', 'total_pymnt', 'total_pymnt_inv', 'total_rec_int',
        'total_rec_late_fee', 'total_rec_prncp', 'next_pymnt_d' # Added next_pymnt_d based on notebook context
    ]
    # Ensure columns exist before dropping
    direct_indicators_to_drop = [col for col in direct_indicators_nb if col in df_cleaning_nb.columns]
    df_cleaning_nb.drop(columns=direct_indicators_to_drop, inplace=True, errors='ignore')
    print(f"Dropped {len(direct_indicators_to_drop)} indicator columns. New shape: {df_cleaning_nb.shape}")


    # --- 4. Drop repetitive/useless object columns (as defined in notebook) ---
    print("\nDropping repetitive/useless object columns...")
    misc_cols_to_drop_nb = [
        'emp_length', # Keeping emp_length_int
        'id', 'emp_title', 'url', 'title', 'zip_code',
        # Also drop loan_status and loan_condition as loan_condition_int is the target
        'loan_status', 'loan_condition',
        # Drop addr_state as region was created
        'addr_state'
    ]
     # Ensure columns exist before dropping
    misc_cols_to_drop = [col for col in misc_cols_to_drop_nb if col in df_cleaning_nb.columns]
    df_cleaning_nb.drop(columns=misc_cols_to_drop, inplace=True, errors='ignore')
    print(f"Dropped {len(misc_cols_to_drop)} misc columns. New shape: {df_cleaning_nb.shape}")

    # Store for next step
    model_7_step2_df = df_cleaning_nb
    print("\n✅ Exclusion steps complete.")

else:
    print("❌ Error: 'model_7_step1_df' not found. Please re-run Part 1.")

--- [Notebook Pipeline - Model 7] Part 2: Data Cleaning (Exclusions) ---
Shape before cleaning: (100000, 155)

Removing 'Current' and 'Issued' loan statuses...
Removed 38822 rows. New shape: (61178, 155)

Dropping columns with > 80% missing values...
Dropped 40 columns. New shape: (61178, 115)

Dropping direct indicator columns...
Dropped 10 indicator columns. New shape: (61178, 105)

Dropping repetitive/useless object columns...
Dropped 9 misc columns. New shape: (61178, 96)

✅ Exclusion steps complete.


In [4]:
import pandas as pd
import numpy as np

# Assuming 'model_7_step2_df' is the DataFrame after the exclusion steps

print("--- [Notebook Pipeline - Model 7] Part 3: Missing Value Imputation ---")

if 'model_7_step2_df' in locals() or 'model_7_step2_df' in globals():
    fillna_df_nb = model_7_step2_df.copy()
    print(f"Shape before imputation: {fillna_df_nb.shape}")
    print(f"Total missing values before: {fillna_df_nb.isnull().sum().sum()}")

    # --- Impute Object Columns (Mode by Region) ---
    print("\nImputing object columns by region mode...")
    object_cols_to_impute = ["last_pymnt_d", "last_credit_pull_d"]
    for column in object_cols_to_impute:
        if column in fillna_df_nb.columns:
            # Calculate mode for each region
            mode_map = fillna_df_nb.groupby("region")[column].agg(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
            # Fill NaNs using the map
            fillna_df_nb[column] = fillna_df_nb.apply(lambda row: mode_map[row['region']] if pd.isnull(row[column]) else row[column], axis=1)
            # Fallback for any regions that might have had only NaNs (fill with overall mode)
            overall_mode = fillna_df_nb[column].mode()[0] if not fillna_df_nb[column].mode().empty else 'Unknown'
            fillna_df_nb[column].fillna(overall_mode, inplace=True)


    # --- Impute Numerical Columns (Median by Region) ---
    print("Imputing specific numerical columns by region median...")
    median_cols_to_impute = ["pub_rec", "total_acc", "emp_length_int"]
    for column in median_cols_to_impute:
        if column in fillna_df_nb.columns:
            fillna_df_nb[column] = fillna_df_nb.groupby("region")[column].transform(lambda x: x.fillna(x.median()))
            # Fallback for any remaining NaNs (e.g., if a whole region was NaN)
            fillna_df_nb[column].fillna(fillna_df_nb[column].median(), inplace=True)


    # --- Impute Numerical Columns (Mean by Region) ---
    print("Imputing specific numerical columns by region mean...")
    mean_cols_to_impute = ["annual_inc", "delinq_2yrs"]
    for column in mean_cols_to_impute:
         if column in fillna_df_nb.columns:
            fillna_df_nb[column] = fillna_df_nb.groupby("region")[column].transform(lambda x: x.fillna(x.mean()))
            # Fallback for any remaining NaNs
            fillna_df_nb[column].fillna(fillna_df_nb[column].mean(), inplace=True)

    # --- Fill Remaining NaNs with Zero (as per notebook) ---
    print("Filling all remaining NaNs with 0...")
    initial_nan_count = fillna_df_nb.isnull().sum().sum()
    fillna_df_nb.fillna(0, inplace=True)
    final_nan_count = fillna_df_nb.isnull().sum().sum()
    print(f"Filled {initial_nan_count - final_nan_count} remaining NaN values.")

    # Store for next step
    model_7_step3_df = fillna_df_nb
    print(f"\n✅ Imputation complete. Final shape: {model_7_step3_df.shape}")
    print(f"Total missing values after: {model_7_step3_df.isnull().sum().sum()}")

else:
    print("❌ Error: 'model_7_step2_df' not found. Please re-run Part 2.")

--- [Notebook Pipeline - Model 7] Part 3: Missing Value Imputation ---
Shape before imputation: (61178, 96)
Total missing values before: 784318

Imputing object columns by region mode...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fillna_df_nb[column].fillna(overall_mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fillna_df_nb[column].fillna(overall_mode, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

Imputing specific numerical columns by region median...
Imputing specific numerical columns by region mean...
Filling all remaining NaNs with 0...
Filled 780549 remaining NaN values.

✅ Imputation complete. Final shape: (61178, 96)
Total missing values after: 0


In [5]:
import pandas as pd
import numpy as np

# Assuming 'model_7_step3_df' is the DataFrame after imputation

print("--- [Notebook Pipeline - Model 7] Part 4: Removing Outliers ---")

if 'model_7_step3_df' in locals() or 'model_7_step3_df' in globals():
    RemoveOutlier_df_nb = model_7_step3_df.copy()
    print(f"Shape before removing outliers: {RemoveOutlier_df_nb.shape}")

    # Apply custom thresholds as used in the notebook
    initial_rows = len(RemoveOutlier_df_nb)

    if 'annual_inc' in RemoveOutlier_df_nb.columns:
        RemoveOutlier_df_nb = RemoveOutlier_df_nb[RemoveOutlier_df_nb['annual_inc'] <= 250000]
    if 'dti' in RemoveOutlier_df_nb.columns:
        RemoveOutlier_df_nb = RemoveOutlier_df_nb[RemoveOutlier_df_nb['dti'] <= 50]
    if 'open_acc' in RemoveOutlier_df_nb.columns:
        RemoveOutlier_df_nb = RemoveOutlier_df_nb[RemoveOutlier_df_nb['open_acc'] <= 40]
    if 'total_acc' in RemoveOutlier_df_nb.columns:
        RemoveOutlier_df_nb = RemoveOutlier_df_nb[RemoveOutlier_df_nb['total_acc'] <= 80]
    if 'revol_util' in RemoveOutlier_df_nb.columns:
        RemoveOutlier_df_nb = RemoveOutlier_df_nb[RemoveOutlier_df_nb['revol_util'] <= 120]
    if 'revol_bal' in RemoveOutlier_df_nb.columns:
        RemoveOutlier_df_nb = RemoveOutlier_df_nb[RemoveOutlier_df_nb['revol_bal'] <= 250000]

    # Reset index after filtering
    RemoveOutlier_df_nb.reset_index(drop=True, inplace=True)

    rows_removed = initial_rows - len(RemoveOutlier_df_nb)
    print(f"Removed {rows_removed} rows due to outlier thresholds.")
    print(f"Shape after removing outliers: {RemoveOutlier_df_nb.shape}")

    # Store for next step
    model_7_step4_df = RemoveOutlier_df_nb
    print("\n✅ Outlier removal complete.")

else:
    print("❌ Error: 'model_7_step3_df' not found. Please re-run Part 3.")

--- [Notebook Pipeline - Model 7] Part 4: Removing Outliers ---
Shape before removing outliers: (61178, 96)
Removed 925 rows due to outlier thresholds.
Shape after removing outliers: (60253, 96)

✅ Outlier removal complete.


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
# Ensure category_encoders is installed
try:
    from category_encoders import TargetEncoder
except ImportError:
    print("Warning: category_encoders not found. Attempting install...")
    try:
        import sys
        !{sys.executable} -m pip install category_encoders --quiet
        from category_encoders import TargetEncoder
        print("Installation successful.")
    except Exception as e:
        print(f"Error installing category_encoders: {e}")
        TargetEncoder = None
from sklearn.preprocessing import StandardScaler

# Assuming 'model_7_step4_df' is the DataFrame after outlier removal
# Assuming 'loan_condition_int' is the target column name

print("--- [Notebook Pipeline - Model 7] Part 5: Feature Engineering ---")

if 'model_7_step4_df' in locals() or 'model_7_step4_df' in globals():
    FE_df_nb = model_7_step4_df.copy()
    target_col_nb = 'loan_condition_int'

    # --- 1. Identify Feature Types ---
    original_cols_fe = FE_df_nb.columns.tolist()
    cat_cols_fe = FE_df_nb.select_dtypes(include=['object']).columns.tolist()
    # Exclude the target variable from numerical columns
    num_cols_fe = FE_df_nb.select_dtypes(exclude=['object']).columns.drop(target_col_nb, errors='ignore').tolist()

    # Separate categorical into binary and multi-category
    dual_cat_cols_fe = [col for col in cat_cols_fe if FE_df_nb[col].nunique() <= 2]
    multi_cat_cols_fe = [col for col in cat_cols_fe if FE_df_nb[col].nunique() > 2]

    print(f"Numerical columns found: {len(num_cols_fe)}")
    print(f"Binary categorical columns: {dual_cat_cols_fe}")
    print(f"Multi-categorical columns: {multi_cat_cols_fe}")

    # --- 2. Binary Encoding (get_dummies) ---
    print("\nApplying Binary Encoding (get_dummies)...")
    FE_df_nb = pd.get_dummies(FE_df_nb, columns=dual_cat_cols_fe, drop_first=True)
    # Get names of newly created binary columns (needed later if scaling)
    new_binary_cols = [col for col in FE_df_nb.columns if col not in original_cols_fe and col != target_col_nb]
    print(f"Created {len(new_binary_cols)} new binary columns.")

    # --- 3. Train/Test Split (Stratified) ---
    print("\nSplitting data into training (80%) and test (20%) sets...")
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # Perform the split
    for train_idx, test_idx in stratified_split.split(FE_df_nb, FE_df_nb[target_col_nb]):
        train_df_nb = FE_df_nb.loc[train_idx]
        test_df_nb = FE_df_nb.loc[test_idx]

    # Separate features and target
    train_y_nb = train_df_nb[[target_col_nb]]
    test_y_nb = test_df_nb[[target_col_nb]]
    train_X_nb = train_df_nb.drop(target_col_nb, axis=1)
    test_X_nb = test_df_nb.drop(target_col_nb, axis=1)

    print(f"Training set shape: X={train_X_nb.shape}, y={train_y_nb.shape}")
    print(f"Test set shape:     X={test_X_nb.shape}, y={test_y_nb.shape}")

    # --- 4. Target Encoding ---
    if TargetEncoder is not None and multi_cat_cols_fe:
        print("\nApplying Target Encoding...")
        # Ensure only existing multi-cat columns are processed
        multi_cat_cols_to_encode = [col for col in multi_cat_cols_fe if col in train_X_nb.columns]
        target_encoder_nb = TargetEncoder(cols=multi_cat_cols_to_encode, smoothing=0.2) # Notebook used smoothing=0.2

        # Fit ONLY on training data
        target_encoder_nb.fit(train_X_nb, train_y_nb.values.ravel()) # .values.ravel() converts to 1D array

        # Transform both train and test data
        train_X_encoded = target_encoder_nb.transform(train_X_nb)
        test_X_encoded = target_encoder_nb.transform(test_X_nb)
        print("Target Encoding applied.")
        # Store list of newly numerical columns from target encoding
        target_encoded_numeric_cols = multi_cat_cols_to_encode
    elif TargetEncoder is None:
        print("❌ Skipping Target Encoding as category_encoders is not available.")
        train_X_encoded = train_X_nb.copy()
        test_X_encoded = test_X_nb.copy()
        target_encoded_numeric_cols = []
    else:
        print("No multi-category columns found for Target Encoding.")
        train_X_encoded = train_X_nb.copy()
        test_X_encoded = test_X_nb.copy()
        target_encoded_numeric_cols = []


    # --- 5. Normalization (StandardScaler) ---
    print("\nApplying Normalization (StandardScaler)...")
    scaler_nb = StandardScaler()

    # Identify all numerical columns for scaling (original + target encoded + new binary)
    # Ensure binary columns from get_dummies are treated as numerical for scaling
    cols_to_scale = num_cols_fe + target_encoded_numeric_cols + new_binary_cols
    # Filter out any columns that might have been dropped or don't exist
    cols_to_scale = [col for col in cols_to_scale if col in train_X_encoded.columns]


    # Fit ONLY on training data
    print(f"Fitting scaler on {len(cols_to_scale)} numerical features...")
    scaler_nb.fit(train_X_encoded[cols_to_scale])

    # Transform both train and test data (in place)
    train_X_scaled = train_X_encoded.copy()
    test_X_scaled = test_X_encoded.copy()

    train_X_scaled[cols_to_scale] = scaler_nb.transform(train_X_encoded[cols_to_scale])
    test_X_scaled[cols_to_scale] = scaler_nb.transform(test_X_encoded[cols_to_scale])
    print("Normalization applied.")

    # Store final datasets for next step
    model_7_step5_train_X = train_X_scaled
    model_7_step5_train_y = train_y_nb
    model_7_step5_test_X = test_X_scaled
    model_7_step5_test_y = test_y_nb

    print(f"\n✅ Feature Engineering complete.")
    print(f"Final training X shape: {model_7_step5_train_X.shape}")
    print(f"Final test X shape: {model_7_step5_test_X.shape}")

else:
    print("❌ Error: 'model_7_step4_df' not found. Please re-run Part 4.")

--- [Notebook Pipeline - Model 7] Part 5: Feature Engineering ---
Numerical columns found: 78
Binary categorical columns: ['term', 'pymnt_plan', 'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'debt_settlement_flag']
Multi-categorical columns: ['grade', 'sub_grade', 'home_ownership', 'verification_status', 'issue_d', 'purpose', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d', 'region']

Applying Binary Encoding (get_dummies)...
Created 7 new binary columns.

Splitting data into training (80%) and test (20%) sets...
Training set shape: X=(48202, 95), y=(48202, 1)
Test set shape:     X=(12051, 95), y=(12051, 1)

Applying Target Encoding...
Target Encoding applied.

Applying Normalization (StandardScaler)...
Fitting scaler on 95 numerical features...
Normalization applied.

✅ Feature Engineering complete.
Final training X shape: (48202, 95)
Final test X shape: (12051, 95)


In [7]:
# Install compatible scikit-learn
!pip install scikit-learn==1.5.2 --force-reinstall --quiet
# Install imbalanced-learn
!pip install imbalanced-learn --quiet

print("✅ Installations attempted again. Please check output.")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.3.4 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.16.3 which is incompatible.
mkl-umath 0.1.1 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
mkl-random 1.2.4 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
mkl-fft 1.3.8 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.4 which is incompatible.
datasets 4.1.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which 

In [8]:
import pandas as pd
import numpy as np
# Ensure imblearn is installed
try:
    from imblearn.under_sampling import RandomUnderSampler
    print("✅ imblearn imported successfully.")
except ImportError:
    print("❌ Error: imbalanced-learn not found or import failed.")
    print("Please ensure imbalanced-learn is installed (e.g., !pip install imbalanced-learn --quiet).")
    RandomUnderSampler = None

# --- Setup ---
# Assuming model_7_step5_train_X, model_7_step5_train_y,
# model_7_step5_test_X, model_7_step5_test_y are from Step 5

print("--- [Notebook Pipeline - Model 7] Part 6: Applying Random Undersampling ---")

# Check if inputs exist and RandomUnderSampler is available
if RandomUnderSampler is None:
    print("❌ Cannot proceed without RandomUnderSampler.")
elif 'model_7_step5_train_X' not in locals() or 'model_7_step5_train_y' not in locals():
    print("❌ Error: Input training data not found. Please ensure Step 5 completed successfully.")
else:
    # --- 1. Initialize Undersampler ---
    rus_nb = RandomUnderSampler(random_state=42, sampling_strategy='auto')

    # --- 2. Apply Undersampling ONLY to Training Data ---
    print(f"Original training data shape: X={model_7_step5_train_X.shape}, y={model_7_step5_train_y.shape}")
    print("Original training target distribution:")
    print(model_7_step5_train_y['loan_condition_int'].value_counts()) # Access column in DataFrame

    try:
        # Pass DataFrame/Series directly
        X_train_undersampled_nb, y_train_undersampled_nb = rus_nb.fit_resample(
            model_7_step5_train_X, model_7_step5_train_y['loan_condition_int'] # Pass Series
        )

        print(f"\nUndersampled training data shape: X={X_train_undersampled_nb.shape}, y={y_train_undersampled_nb.shape}")
        print("Undersampled training target distribution:")
        print(y_train_undersampled_nb.value_counts()) # Now it's a Series

        # --- Store Final Datasets for Modeling ---
        # NOTE: The notebook used these undersampled sets directly for feature selection next
        model_7_step6_train_X = X_train_undersampled_nb
        model_7_step6_train_y = y_train_undersampled_nb # This is now a Series

        # Keep the original test set (encoded and scaled but not undersampled)
        model_7_step6_test_X = model_7_step5_test_X
        model_7_step6_test_y = model_7_step5_test_y # This is still a DataFrame

        print("\n✅ Undersampling complete. Datasets ready for feature selection.")

    except Exception as e:
        print(f"❌ An error occurred during fit_resample: {e}")

Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x781950f4e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1187, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen() error
Exception ignored on calling ctypes callback function: <functio

✅ imblearn imported successfully.
--- [Notebook Pipeline - Model 7] Part 6: Applying Random Undersampling ---
Original training data shape: X=(48202, 95), y=(48202, 1)
Original training target distribution:
loan_condition_int
0    37664
1    10538
Name: count, dtype: int64

Undersampled training data shape: X=(21076, 95), y=(21076,)
Undersampled training target distribution:
loan_condition_int
0    10538
1    10538
Name: count, dtype: int64

✅ Undersampling complete. Datasets ready for feature selection.


In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# --- Setup ---
# Assuming model_7_step6_train_X (undersampled, scaled, encoded training features) exists
# Assuming model_7_step6_test_X (original scaled, encoded test features) exists

print("--- [Notebook Pipeline - Model 7] Part 7: Feature Selection ---")

if 'model_7_step6_train_X' not in locals() or 'model_7_step6_test_X' not in locals():
    print("❌ Error: Input data (model_7_step6_train_X or model_7_step6_test_X) not found.")
    print("Please ensure Step 6 (Undersampling) completed successfully.")
else:
    X_train_fs = model_7_step6_train_X.copy()
    X_test_fs = model_7_step6_test_X.copy() # Apply selection to test set too
    # The notebook implicitly uses the undersampled y_train for wrapper, but we only need X for VarianceThreshold
    # y_train_fs = model_7_step6_train_y # Undersampled training target

    print(f"Shape before VarianceThreshold: {X_train_fs.shape}")

    # --- 1. Apply VarianceThreshold ---
    # The notebook used threshold=1 on the *scaled* data
    selector = VarianceThreshold(threshold=1)
    selector.fit(X_train_fs)

    # Get the names of the features kept by the threshold
    filtered_feature_names = X_train_fs.columns[selector.get_support()]
    X_train_variance_filtered = X_train_fs[filtered_feature_names]

    cols_removed = X_train_fs.shape[1] - X_train_variance_filtered.shape[1]
    print(f"Applied VarianceThreshold(1). Removed {cols_removed} features.")
    print(f"Shape after VarianceThreshold: {X_train_variance_filtered.shape}")

    # --- 2. Select Final Features (Based on Notebook's Wrapper Result) ---
    # The notebook ran a time-consuming wrapper (SFS) and identified these 9 features.
    # We will directly select these for replication purposes.
    vars_final_nb = [
        'delinq_2yrs',
        'last_fico_range_high',
        'last_fico_range_low',
        'acc_now_delinq',
        'open_acc_6m',
        'total_bal_il',
        'il_util',
        'open_rv_12m',
        'all_util'
     ]
    print(f"\nSelecting the final {len(vars_final_nb)} features identified by the notebook's wrapper method...")

    # Ensure these final columns actually exist after variance thresholding
    final_cols_exist = [col for col in vars_final_nb if col in X_train_variance_filtered.columns]

    if len(final_cols_exist) != len(vars_final_nb):
        print(f"⚠️ Warning: Not all expected final features ({vars_final_nb}) were present after VarianceThreshold.")
        print(f"Features missing: {list(set(vars_final_nb) - set(final_cols_exist))}")
        print(f"Proceeding with the {len(final_cols_exist)} available features: {final_cols_exist}")
        final_selected_cols = final_cols_exist
    else:
        final_selected_cols = vars_final_nb
        print("All expected final features found.")


    # Apply the final selection to both train and test sets
    X_train_final_selected = X_train_variance_filtered[final_selected_cols]
    # Also apply to the original test set (after variance filter applied to it)
    X_test_variance_filtered = X_test_fs[filtered_feature_names] # Apply VT filter result
    X_test_final_selected = X_test_variance_filtered[final_selected_cols] # Select final cols


    # Store final datasets for modeling
    model_7_step7_train_X = X_train_final_selected
    model_7_step7_train_y = model_7_step6_train_y # Use the undersampled y from step 6
    model_7_step7_test_X = X_test_final_selected
    model_7_step7_test_y = model_7_step6_test_y # Use the original test y from step 6

    print(f"\n✅ Feature Selection complete.")
    print(f"Final training X shape: {model_7_step7_train_X.shape}")
    print(f"Final test X shape: {model_7_step7_test_X.shape}")

--- [Notebook Pipeline - Model 7] Part 7: Feature Selection ---
Shape before VarianceThreshold: (21076, 95)
Applied VarianceThreshold(1). Removed 37 features.
Shape after VarianceThreshold: (21076, 58)

Selecting the final 9 features identified by the notebook's wrapper method...
All expected final features found.

✅ Feature Selection complete.
Final training X shape: (21076, 9)
Final test X shape: (12051, 9)


In [10]:
!pip install numpy==1.26.4 --force-reinstall --quiet
print("✅ NumPy downgrade attempted. Please check output.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.16.3 which is incompatible.
datasets 4.1.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
onnx 1.18.0 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompatible.
ydata-profiling 4.17.0 requires scipy<1.16,>=1.4.1, but you have scipy 1.16.3 which is incompatible.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompat

In [12]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt # Removed
# import seaborn as sns # Removed
import scipy.stats as sps
from sklearn.model_selection import KFold
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                             recall_score, f1_score, roc_auc_score)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import xgboost as xgb # Ensure xgboost is installed if needed: !pip install xgboost --quiet
import time
import warnings
warnings.filterwarnings("ignore") # Suppress convergence warnings etc.

# --- Setup ---
# Assuming model_7_step7_train_X, model_7_step7_train_y exist from Step 7

print("--- [Notebook Pipeline - Model 7] Part 8 (No Plotting): Model Building & CV ---")

# Check if input data exists
if 'model_7_step7_train_X' not in locals() or 'model_7_step7_train_y' not in locals():
    print("❌ Error: Input training data not found. Please ensure Step 7 completed successfully.")
else:
    X = model_7_step7_train_X
    # Ensure y is a Series or 1D array
    if isinstance(model_7_step7_train_y, pd.DataFrame):
        y = model_7_step7_train_y.iloc[:, 0]
    else:
        y = model_7_step7_train_y

    # --- 1. Define Helper Functions (Plotting Removed) ---

    # Removed plot_confusion_matrix function

    def calculate_scores(model, X_trn, y_trn, X_tst, y_tst):
        # ... (rest of the function is the same as before) ...
        model.fit(X_trn, y_trn)
        y_pred = model.predict(X_tst)
        accuracy = accuracy_score(y_tst, y_pred)
        conf_matrix = confusion_matrix(y_tst, y_pred)
        precision = precision_score(y_tst, y_pred, zero_division=0)
        recall = recall_score(y_tst, y_pred, zero_division=0)
        f1 = f1_score(y_tst, y_pred, zero_division=0)
        try:
            y_pred_proba = model.predict_proba(X_tst)[:, 1]
            auc = roc_auc_score(y_tst, y_pred_proba)
            mask = np.array(y_tst).astype(bool)
            churn = y_pred_proba[mask]
            not_churn = y_pred_proba[~mask]
            ks = sps.ks_2samp(churn, not_churn)[0] if len(churn) > 0 and len(not_churn) > 0 else 0.0
        except AttributeError:
            auc = 0.0
            ks = 0.0
            print(f"Warning: {type(model).__name__} does not have predict_proba. AUC/KS set to 0.")
        except Exception as e_proba: # Catch other potential errors during predict_proba
             auc = 0.0
             ks = 0.0
             print(f"Warning: Error during predict_proba for {type(model).__name__}: {e_proba}. AUC/KS set to 0.")


        return accuracy, auc, ks, conf_matrix, precision, recall, f1

    def calculate_cv_scores(model, X, y, cv=5):
        # ... (setup is the same) ...
        y_array = y.values if isinstance(y, pd.Series) else np.array(y)
        kf = KFold(n_splits=cv, shuffle=True, random_state=42)
        accuracy_scores, auc_scores, ks_scores = [], [], []
        conf_matrices, precision_scores, recall_scores, f1_scores = [], [], [], []

        for fold, (train_index, test_index) in enumerate(kf.split(X)):
             # ... (splitting is the same) ...
            X_trn, X_tst = X.iloc[train_index], X.iloc[test_index]
            y_trn, y_tst = y.iloc[train_index] if isinstance(y, pd.Series) else y_array[train_index], \
                           y.iloc[test_index] if isinstance(y, pd.Series) else y_array[test_index]

            try:
                accuracy, auc, ks, conf_matrix, precision, recall, f1 = calculate_scores(model, X_trn, y_trn, X_tst, y_tst)
                accuracy_scores.append(accuracy)
                auc_scores.append(auc)
                ks_scores.append(ks)
                conf_matrices.append(conf_matrix)
                precision_scores.append(precision)
                recall_scores.append(recall)
                f1_scores.append(f1)
            except Exception as e_fold:
                print(f"Error during CV fold {fold+1} for {type(model).__name__}: {e_fold}")
                # Append NaNs or handle error as appropriate
                accuracy_scores.append(np.nan)
                auc_scores.append(np.nan)
                ks_scores.append(np.nan)
                # conf_matrices.append(np.array([[np.nan, np.nan], [np.nan, np.nan]])) # Placeholder
                precision_scores.append(np.nan)
                recall_scores.append(np.nan)
                f1_scores.append(np.nan)


        # Calculate mean confusion matrix, handling potential empty lists or errors
        if conf_matrices:
             # Ensure all matrices have the same shape before averaging (e.g., handle folds that failed)
             valid_matrices = [cm for cm in conf_matrices if isinstance(cm, np.ndarray) and cm.shape == (2, 2)]
             if valid_matrices:
                  mean_conf_matrix = np.mean(valid_matrices, axis=0)
                  print(f"\nMean Confusion Matrix for {type(model).__name__} (GoodLoan=0, BadLoan=1):\n", mean_conf_matrix)
             else:
                  print(f"\nCould not calculate Mean Confusion Matrix for {type(model).__name__} due to errors in folds.")
        else:
             print(f"\nNo Confusion Matrices generated for {type(model).__name__}.")


        # Calculate mean scores, ignoring NaNs from failed folds
        final_accuracy = np.nanmean(accuracy_scores) if accuracy_scores else 0.0
        final_auc = np.nanmean(auc_scores) if auc_scores else 0.0
        final_ks = np.nanmean(ks_scores) if ks_scores else 0.0
        final_precision = np.nanmean(precision_scores) if precision_scores else 0.0
        final_recall = np.nanmean(recall_scores) if recall_scores else 0.0
        final_f1 = np.nanmean(f1_scores) if f1_scores else 0.0

        return final_accuracy, final_auc, final_ks, final_precision, final_recall, final_f1


    def fit_models_summary(models, X, y, cv=5):
        # ... (rest of the function is the same as before) ...
        summary = pd.DataFrame(columns=['accuracy', 'auc', 'ks', 'precision', 'recall', 'f1'])
        for name, model in models.items():
            print(f"--- Processing: {name} ---")
            start_time = time.time()
            try:
                accuracy, auc, ks, precision, recall, f1 = calculate_cv_scores(model, X, y, cv=cv)
                summary.loc[name] = [accuracy, auc, ks, precision, recall, f1]
            except Exception as e_model:
                 print(f"!!!!! Error processing model {name}: {e_model} !!!!!")
                 summary.loc[name] = [np.nan] * 6 # Add row with NaNs

            end_time = time.time()
            print(f"{name} processed in {end_time - start_time:.2f} seconds.")
            print("-" * (len(name) + 20))
        return summary.sort_values(by='f1', ascending=False)


    # --- 2. Define Models (Same as before) ---
    mss=60
    msl=int(mss/2)

    models_nb = {
         'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
         'Decision Tree': DecisionTreeClassifier(max_depth=10, min_samples_split=mss, min_samples_leaf=msl, random_state=42),
         'K Nearest Neighbors': KNeighborsClassifier(n_neighbors=20, n_jobs=-1),
         'Random Forest': RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42, n_jobs=-1),
         'Gaussian Naive Bayes': GaussianNB(var_smoothing=1),
         'Light GBM': lgb.LGBMClassifier(n_estimators=50, max_depth=3, random_state=42, n_jobs=-1, verbose=-1),
         'XGBoost': xgb.XGBClassifier(n_estimators=50, max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss'),
         'Gradient Boosting': GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=42),
         "Neural Network": MLPClassifier(hidden_layer_sizes=(10,10), random_state=42, max_iter=500)
    }

    # --- 3. Run Cross-Validation ---
    print("\nStarting 5-fold cross-validation for all models (plotting disabled)...")
    start_cv_time = time.time()
    baseline_summary_nb = fit_models_summary(models_nb, X, y, cv=5)
    end_cv_time = time.time()
    print(f"\nCross-validation finished in {end_cv_time - start_cv_time:.2f} seconds.")

    # Display summary table
    print("\n--- Cross-Validation Summary (Sorted by F1-Score) ---")
    print(baseline_summary_nb)

    # Store results
    model_7_cv_summary = baseline_summary_nb


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.3.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-package

AttributeError: _ARRAY_API not found

--- [Notebook Pipeline - Model 7] Part 8 (No Plotting): Model Building & CV ---

Starting 5-fold cross-validation for all models (plotting disabled)...
--- Processing: Logistic Regression ---

Mean Confusion Matrix for LogisticRegression (GoodLoan=0, BadLoan=1):
 [[1850.2  257.4]
 [ 301.2 1806.4]]
Logistic Regression processed in 0.46 seconds.
---------------------------------------
--- Processing: Decision Tree ---

Mean Confusion Matrix for DecisionTreeClassifier (GoodLoan=0, BadLoan=1):
 [[1797.4  310.2]
 [ 257.6 1850. ]]
Decision Tree processed in 0.44 seconds.
---------------------------------
--- Processing: K Nearest Neighbors ---

Mean Confusion Matrix for KNeighborsClassifier (GoodLoan=0, BadLoan=1):
 [[1972.8  134.8]
 [ 937.2 1170.4]]
K Nearest Neighbors processed in 3.97 seconds.
---------------------------------------
--- Processing: Random Forest ---

Mean Confusion Matrix for RandomForestClassifier (GoodLoan=0, BadLoan=1):
 [[1804.   303.6]
 [ 256.2 1851.4]]
Random Forest

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                             recall_score, f1_score, roc_auc_score)
import scipy.stats as sps
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
import xgboost as xgb
import time
import warnings
warnings.filterwarnings("ignore")

# --- Setup ---
# Assuming model_7_step7_train_X, model_7_step7_train_y (undersampled train)
# Assuming model_7_step7_test_X, model_7_step7_test_y (original test) exist from previous steps

print("--- [Notebook Pipeline - Model 7] Part 9: Final Evaluation on Test Set ---")

# Check if data exists
if ('model_7_step7_train_X' not in locals() or 'model_7_step7_train_y' not in locals() or
    'model_7_step7_test_X' not in locals() or 'model_7_step7_test_y' not in locals()):
    print("❌ Error: Input data not found. Please ensure previous steps ran successfully.")
else:
    X_trn_final = model_7_step7_train_X
    # Ensure y is Series/1D array
    y_trn_final = model_7_step7_train_y if isinstance(model_7_step7_train_y, pd.Series) else pd.Series(model_7_step7_train_y)
    X_tst_final = model_7_step7_test_X
    # Ensure y is Series/1D array from DataFrame
    y_tst_final = model_7_step7_test_y.iloc[:, 0] if isinstance(model_7_step7_test_y, pd.DataFrame) else pd.Series(model_7_step7_test_y)


    # --- 1. Define Helper Function for Test Set Scores ---
    def calculate_test_scores(model, X_trn, y_trn, X_tst, y_tst):
        # Returns: accuracy, auc, ks, conf_matrix, precision, recall, f1
        # Uses the same logic as the calculate_scores from CV step
        model.fit(X_trn, y_trn)
        y_pred = model.predict(X_tst)
        accuracy = accuracy_score(y_tst, y_pred)
        conf_matrix = confusion_matrix(y_tst, y_pred)
        precision = precision_score(y_tst, y_pred, zero_division=0)
        recall = recall_score(y_tst, y_pred, zero_division=0)
        f1 = f1_score(y_tst, y_pred, zero_division=0)
        try:
            y_pred_proba = model.predict_proba(X_tst)[:, 1]
            auc = roc_auc_score(y_tst, y_pred_proba)
            mask = np.array(y_tst).astype(bool)
            churn = y_pred_proba[mask]
            not_churn = y_pred_proba[~mask]
            ks = sps.ks_2samp(churn, not_churn)[0] if len(churn) > 0 and len(not_churn) > 0 else 0.0
        except AttributeError:
             auc, ks = 0.0, 0.0
        except Exception as e_proba:
             auc, ks = 0.0, 0.0
             print(f"Warning: Prob pred error for {type(model).__name__}: {e_proba}. AUC/KS=0.")

        return accuracy, auc, ks, conf_matrix, precision, recall, f1

    def fit_first_level_preds(models, X_trn, y_trn, X_tst):
        # Generates predictions for stacking (similar to notebook function)
        X2_trn = np.zeros((len(X_trn), 2 * len(models)))
        X2_tst = np.zeros((len(X_tst), 2 * len(models)))

        for i, (name, model) in enumerate(models.items()):
            print(f"  Generating stacking preds for: {name}")
            try:
                model.fit(X_trn, y_trn)
                y_trn_pred_proba = model.predict_proba(X_trn)
                y_tst_pred_proba = model.predict_proba(X_tst)
                X2_trn[:, i*2:(i+1)*2] = y_trn_pred_proba
                X2_tst[:, i*2:(i+1)*2] = y_tst_pred_proba
            except Exception as e_stack_fit:
                 print(f"  Error fitting/predicting {name} for stacking: {e_stack_fit}")
                 # Fill with NaNs or zeros if a model fails
                 X2_trn[:, i*2:(i+1)*2] = np.nan
                 X2_tst[:, i*2:(i+1)*2] = np.nan

        return X2_trn, X2_tst


    # --- 2. Define Models (Same base models as CV) ---
    mss=60
    msl=int(mss/2)
    models_nb_final = {
         'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
         'Decision Tree': DecisionTreeClassifier(max_depth=10, min_samples_split=mss, min_samples_leaf=msl, random_state=42),
         'K Nearest Neighbors': KNeighborsClassifier(n_neighbors=20, n_jobs=-1),
         'Random Forest': RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42, n_jobs=-1),
         'Gaussian Naive Bayes': GaussianNB(var_smoothing=1),
         'Light GBM': lgb.LGBMClassifier(n_estimators=50, max_depth=3, random_state=42, n_jobs=-1, verbose=-1),
         'XGBoost': xgb.XGBClassifier(n_estimators=50, max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss'),
         'Gradient Boosting': GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=42),
         "Neural Network": MLPClassifier(hidden_layer_sizes=(10,10), random_state=42, max_iter=500)
    }

    # --- 3. Evaluate Base Models on Test Set ---
    print("\n--- Evaluating Base Models on Hold-Out Test Set ---")
    holdout_summary_nb = pd.DataFrame(columns=['accuracy', 'auc', 'ks', 'precision', 'recall', 'f1'])
    all_conf_matrices = {} # Store confusion matrices
    start_base_time = time.time()

    for name, model in models_nb_final.items():
        print(f"Processing: {name}")
        try:
            accuracy, auc, ks, conf_matrix, precision, recall, f1 = calculate_test_scores(model, X_trn_final, y_trn_final, X_tst_final, y_tst_final)
            holdout_summary_nb.loc[name] = [accuracy, auc, ks, precision, recall, f1]
            all_conf_matrices[name] = conf_matrix
            print(f"  {name}: F1={f1:.4f}, Recall={recall:.4f}, AUC={auc:.4f}")
        except Exception as e_base_eval:
             print(f"!!!!! Error evaluating model {name}: {e_base_eval} !!!!!")
             holdout_summary_nb.loc[name] = [np.nan] * 6
             all_conf_matrices[name] = np.array([[np.nan]*2]*2)

    end_base_time = time.time()
    print(f"\nBase model evaluation finished in {end_base_time - start_base_time:.2f} seconds.")


    # --- 4. Evaluate Bagging Models on Test Set ---
    print("\n--- Evaluating Bagging Models on Hold-Out Test Set ---")
    bagging_models = {}
    start_bagging_time = time.time()

    # Use a subset of models for bagging as per the notebook's final list if needed
    models_for_bagging = {k:v for k,v in models_nb_final.items() if k not in ['K Nearest Neighbors', 'Gaussian Naive Bayes']} # Example subset

    for name, model in models_for_bagging.items():
        print(f"Processing Bagging: {name}")
        try:
            bagging_model = BaggingClassifier(base_estimator=model, n_estimators=10, random_state=42, n_jobs=-1)
            accuracy, auc, ks, conf_matrix, precision, recall, f1 = calculate_test_scores(bagging_model, X_trn_final, y_trn_final, X_tst_final, y_tst_final)
            bagging_name = name + ' (Bagging)'
            holdout_summary_nb.loc[bagging_name] = [accuracy, auc, ks, precision, recall, f1]
            all_conf_matrices[bagging_name] = conf_matrix
            print(f"  {bagging_name}: F1={f1:.4f}, Recall={recall:.4f}, AUC={auc:.4f}")
        except Exception as e_bagging_eval:
             print(f"!!!!! Error evaluating Bagging model {name}: {e_bagging_eval} !!!!!")
             holdout_summary_nb.loc[name + ' (Bagging)'] = [np.nan] * 6
             all_conf_matrices[name + ' (Bagging)'] = np.array([[np.nan]*2]*2)


    end_bagging_time = time.time()
    print(f"\nBagging model evaluation finished in {end_bagging_time - start_bagging_time:.2f} seconds.")


    # --- 5. Evaluate Stacking Model on Test Set ---
    print("\n--- Evaluating Stacking Model on Hold-Out Test Set ---")
    start_stacking_time = time.time()
    # Use the same subset of models for stacking's first level
    models_for_stacking = models_for_bagging # Using the same subset for consistency

    print("Generating L1 predictions for stacking...")
    X2_trn_stack, X2_tst_stack = fit_first_level_preds(models_for_stacking, X_trn_final, y_trn_final, X_tst_final)

    # Check for NaNs introduced by failed base models
    if np.isnan(X2_trn_stack).any() or np.isnan(X2_tst_stack).any():
        print("Warning: NaNs found in stacking features. Attempting imputation with 0.")
        X2_trn_stack = np.nan_to_num(X2_trn_stack, nan=0.0) # Replace NaN with 0
        X2_tst_stack = np.nan_to_num(X2_tst_stack, nan=0.0)

    # Define and evaluate the L2 model (XGBoost in the notebook)
    stack_model_final = xgb.XGBClassifier(max_depth=3, random_state=42, use_label_encoder=False, eval_metric='logloss')
    print("Evaluating final stacking model (XGBoost)...")
    try:
        accuracy, auc, ks, conf_matrix, precision, recall, f1 = calculate_test_scores(stack_model_final, X2_trn_stack, y_trn_final, X2_tst_stack, y_tst_final)
        stacking_name = 'Stacking Model (XGB)'
        holdout_summary_nb.loc[stacking_name] = [accuracy, auc, ks, precision, recall, f1]
        all_conf_matrices[stacking_name] = conf_matrix
        print(f"  {stacking_name}: F1={f1:.4f}, Recall={recall:.4f}, AUC={auc:.4f}")
    except Exception as e_stack_eval:
         print(f"!!!!! Error evaluating Stacking model: {e_stack_eval} !!!!!")
         holdout_summary_nb.loc['Stacking Model (XGB)'] = [np.nan] * 6
         all_conf_matrices['Stacking Model (XGB)'] = np.array([[np.nan]*2]*2)


    end_stacking_time = time.time()
    print(f"\nStacking model evaluation finished in {end_stacking_time - start_stacking_time:.2f} seconds.")


    # --- 6. Display Final Summary ---
    print("\n--- Final Hold-Out Test Set Summary (Sorted by F1-Score) ---")
    final_summary_sorted = holdout_summary_nb.sort_values(by='f1', ascending=False)
    print(final_summary_sorted)

    # Optionally display confusion matrices
    # print("\n--- Confusion Matrices (Test Set) ---")
    # for name, matrix in all_conf_matrices.items():
    #     print(f"\n{name}:")
    #     print(matrix)

--- [Notebook Pipeline - Model 7] Part 9: Final Evaluation on Test Set ---

--- Evaluating Base Models on Hold-Out Test Set ---
Processing: Logistic Regression
  Logistic Regression: F1=0.7472, Recall=0.8538, AUC=0.9283
Processing: Decision Tree
  Decision Tree: F1=0.7320, Recall=0.8834, AUC=0.9259
Processing: K Nearest Neighbors
  K Nearest Neighbors: F1=0.6015, Recall=0.5224, AUC=0.7587
Processing: Random Forest
  Random Forest: F1=0.7385, Recall=0.8812, AUC=0.9297
Processing: Gaussian Naive Bayes
  Gaussian Naive Bayes: F1=0.6761, Recall=0.6416, AUC=0.9096
Processing: Light GBM
  Light GBM: F1=0.7466, Recall=0.8713, AUC=0.9317
Processing: XGBoost
  XGBoost: F1=0.7446, Recall=0.8740, AUC=0.9316
Processing: Gradient Boosting
  Gradient Boosting: F1=0.7459, Recall=0.8713, AUC=0.9317
Processing: Neural Network
  Neural Network: F1=0.7419, Recall=0.8812, AUC=0.9320

Base model evaluation finished in 11.01 seconds.

--- Evaluating Bagging Models on Hold-Out Test Set ---
Processing Bagging