In [1]:
pip install numpy pandas matplotlib seaborn scikit-learn imbalanced-learn xgboost lightgbm catboost

Note: you may need to restart the kernel to use updated packages.


In [316]:
# Numerical and Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB

# Handling Imbalanced Datasets
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Gradient Boosting Algorithms
import xgboost as xgb
import lightgbm as lgb
import catboost as cb



In [318]:
file_path = '/Users/liamr/Downloads/application_data.csv'

data = pd.read_csv(file_path)

print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None


In [320]:
# Print columns with missing values
missing_columns = data.columns[data.isnull().any()]
print(data[missing_columns].isnull().sum())

AMT_ANNUITY                       12
AMT_GOODS_PRICE                  278
NAME_TYPE_SUITE                 1292
OWN_CAR_AGE                   202929
OCCUPATION_TYPE                96391
                               ...  
AMT_REQ_CREDIT_BUREAU_DAY      41519
AMT_REQ_CREDIT_BUREAU_WEEK     41519
AMT_REQ_CREDIT_BUREAU_MON      41519
AMT_REQ_CREDIT_BUREAU_QRT      41519
AMT_REQ_CREDIT_BUREAU_YEAR     41519
Length: 67, dtype: int64


In [322]:
df_cleaned = data.dropna(axis=1)
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 55 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   NAME_CONTRACT_TYPE           307511 non-null  object 
 3   CODE_GENDER                  307511 non-null  object 
 4   FLAG_OWN_CAR                 307511 non-null  object 
 5   FLAG_OWN_REALTY              307511 non-null  object 
 6   CNT_CHILDREN                 307511 non-null  int64  
 7   AMT_INCOME_TOTAL             307511 non-null  float64
 8   AMT_CREDIT                   307511 non-null  float64
 9   NAME_INCOME_TYPE             307511 non-null  object 
 10  NAME_EDUCATION_TYPE          307511 non-null  object 
 11  NAME_FAMILY_STATUS           307511 non-null  object 
 12  NAME_HOUSING_TYPE            307511 non-null  object 
 13 

In [324]:
# Drop all columns with dtype 'object'
df_cleaned = df_cleaned.drop(df_cleaned.select_dtypes(include=['object']).columns, axis=1)
X = df_cleaned.drop(columns=['TARGET'])
y = df_cleaned['TARGET']
print("Column Names:", X.columns.tolist())
X.head()

Column Names: ['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']


Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,...,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
0,100002,0,202500.0,406597.5,0.018801,-9461,-637,-3648.0,-2120,1,...,0,0,0,0,0,0,0,0,0,0
1,100003,0,270000.0,1293502.5,0.003541,-16765,-1188,-1186.0,-291,1,...,0,0,0,0,0,0,0,0,0,0
2,100004,0,67500.0,135000.0,0.010032,-19046,-225,-4260.0,-2531,1,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,135000.0,312682.5,0.008019,-19005,-3039,-9833.0,-2437,1,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,121500.0,513000.0,0.028663,-19932,-3038,-4311.0,-3458,1,...,0,0,0,0,0,0,0,0,0,0


In [326]:
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

scaled_X


array([[-1.73342255, -0.57753784,  0.14212925, ..., -0.02440195,
        -0.02252901, -0.01830463],
       [-1.73341283, -0.57753784,  0.42679193, ..., -0.02440195,
        -0.02252901, -0.01830463],
       [-1.7334031 , -0.57753784, -0.4271961 , ..., -0.02440195,
        -0.02252901, -0.01830463],
       ...,
       [ 1.73239096, -0.57753784, -0.06662338, ..., -0.02440195,
        -0.02252901, -0.01830463],
       [ 1.73240069, -0.57753784,  0.00928667, ..., -0.02440195,
        -0.02252901, -0.01830463],
       [ 1.73241042, -0.57753784, -0.04764587, ..., -0.02440195,
        -0.02252901, -0.01830463]])

In [328]:
#undersampling majority class
# Define the undersampler
undersampler = RandomUnderSampler(random_state=42)

# Resample the dataset
X_resampled, y_resampled = undersampler.fit_resample(scaled_X, y)

# Check new class distribution
print("Resampled Class Distribution:")
print(pd.Series(y_resampled).value_counts())


Resampled Class Distribution:
TARGET
0    24825
1    24825
Name: count, dtype: int64


In [368]:
#Perform PCA
pca = PCA(n_components=30)  # Retain 2 components
X_resampled_pca = pca.fit_transform(X_resampled)

#Explained variance
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)
print("Total Explained Variance:", explained_variance.sum())

#Explained variance >90% with 30 components

print(f"Shape after PCA - X_resampled_pca: {X_resampled_pca.shape}")
print(f"Shape after PCA - Y_resampled: {y_resampled.shape}")
assert X_resampled_pca.shape[0] == len(y_resampled), "Mismatch in number of samples!"


Explained Variance Ratio: [0.10737567 0.07717819 0.06571915 0.05634468 0.04660801 0.04046183
 0.03185656 0.03118423 0.02981871 0.02828035 0.02510227 0.02349515
 0.02258265 0.022061   0.02165532 0.02111891 0.02102432 0.02074289
 0.02019629 0.0200047  0.01850453 0.01829765 0.0180798  0.01771601
 0.01754289 0.01704449 0.01631816 0.0160218  0.01571059 0.01568829]
Total Explained Variance: 0.9037351020954741
Shape after PCA - X_resampled_pca: (49650, 30)
Shape after PCA - Y_resampled: (49650,)


In [370]:
# First split: Training (70%) and temporary (30%)
X_train, X_test, y_train, y_test = train_test_split(X_resampled_pca, y_resampled, test_size=0.3, random_state=42)
# Second split: Validation (20%) and Testing (10%)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.3333, random_state=42)

# Check the sizes of each split
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")


Training set: (34755, 30), (34755,)
Validation set: (9930, 30), (9930,)
Testing set: (4965, 30), (4965,)


In [372]:
# Initialize the KNN model with 5 neighbors (default)
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the training data
knn.fit(X_train, y_train)

y_val_pred = knn.predict(X_val)

y_test_pred = knn.predict(X_test)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# Evaluate the model on the testing set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Accuracy:", test_accuracy)
print("Testing Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))



Validation Accuracy: 0.5384692849949647
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.53      0.53      4909
           1       0.54      0.55      0.55      5021

    accuracy                           0.54      9930
   macro avg       0.54      0.54      0.54      9930
weighted avg       0.54      0.54      0.54      9930

Validation Confusion Matrix:
[[2585 2324]
 [2259 2762]]
Testing Accuracy: 0.5367573011077543
Testing Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.52      0.53      2527
           1       0.53      0.56      0.54      2438

    accuracy                           0.54      4965
   macro avg       0.54      0.54      0.54      4965
weighted avg       0.54      0.54      0.54      4965

Testing Confusion Matrix:
[[1311 1216]
 [1084 1354]]


In [374]:
#KNN model with 3 neighbors
knn2 = KNeighborsClassifier(n_neighbors=3)

# Train the model on the training data
knn2.fit(X_train, y_train)

y_val_pred2 = knn2.predict(X_val)

y_test_pred2 = knn2.predict(X_test)

# Evaluate the model on the validation set
val_accuracy2 = accuracy_score(y_val, y_val_pred2)
print("Validation Accuracy:", val_accuracy2)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred2))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred2))

# Evaluate the model on the testing set
test_accuracy2 = accuracy_score(y_test, y_test_pred2)
print("Testing Accuracy:", test_accuracy2)
print("Testing Classification Report:")
print(classification_report(y_test, y_test_pred2))
print("Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred2))




Validation Accuracy: 0.537361530715005
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.52      0.53      4909
           1       0.54      0.56      0.55      5021

    accuracy                           0.54      9930
   macro avg       0.54      0.54      0.54      9930
weighted avg       0.54      0.54      0.54      9930

Validation Confusion Matrix:
[[2544 2365]
 [2229 2792]]
Testing Accuracy: 0.5417925478348439
Testing Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.53      0.54      2527
           1       0.53      0.55      0.54      2438

    accuracy                           0.54      4965
   macro avg       0.54      0.54      0.54      4965
weighted avg       0.54      0.54      0.54      4965

Testing Confusion Matrix:
[[1341 1186]
 [1089 1349]]


In [376]:
#KNN model with 7 neighbors
knn3 = KNeighborsClassifier(n_neighbors=7)

# Train the model on the training data
knn3.fit(X_train, y_train)

y_val_pred3 = knn3.predict(X_val)

y_test_pred3 = knn3.predict(X_test)

# Evaluate the model on the validation set
val_accuracy3 = accuracy_score(y_val, y_val_pred3)
print("Validation Accuracy:", val_accuracy3)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred3))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred3))

# Evaluate the model on the testing set
test_accuracy3 = accuracy_score(y_test, y_test_pred3)
print("Testing Accuracy:", test_accuracy3)
print("Testing Classification Report:")
print(classification_report(y_test, y_test_pred3))
print("Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred3))

Validation Accuracy: 0.5477341389728096
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.53      0.54      4909
           1       0.55      0.57      0.56      5021

    accuracy                           0.55      9930
   macro avg       0.55      0.55      0.55      9930
weighted avg       0.55      0.55      0.55      9930

Validation Confusion Matrix:
[[2594 2315]
 [2176 2845]]
Testing Accuracy: 0.5438066465256798
Testing Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.52      0.54      2527
           1       0.53      0.57      0.55      2438

    accuracy                           0.54      4965
   macro avg       0.54      0.54      0.54      4965
weighted avg       0.54      0.54      0.54      4965

Testing Confusion Matrix:
[[1319 1208]
 [1057 1381]]


In [378]:
#KNN model with 9 neighbors
knn4 = KNeighborsClassifier(n_neighbors=9)

# Train the model on the training data
knn4.fit(X_train, y_train)

y_val_pred4 = knn4.predict(X_val)

y_test_pred4 = knn4.predict(X_test)

# Evaluate the model on the validation set
val_accuracy4 = accuracy_score(y_val, y_val_pred4)
print("Validation Accuracy:", val_accuracy4)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred4))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred4))

# Evaluate the model on the testing set
test_accuracy4 = accuracy_score(y_test, y_test_pred4)
print("Testing Accuracy:", test_accuracy4)
print("Testing Classification Report:")
print(classification_report(y_test, y_test_pred4))
print("Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred4))


Validation Accuracy: 0.5564954682779456
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.54      0.54      4909
           1       0.56      0.58      0.57      5021

    accuracy                           0.56      9930
   macro avg       0.56      0.56      0.56      9930
weighted avg       0.56      0.56      0.56      9930

Validation Confusion Matrix:
[[2633 2276]
 [2128 2893]]
Testing Accuracy: 0.5436052366565962
Testing Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.52      0.54      2527
           1       0.53      0.57      0.55      2438

    accuracy                           0.54      4965
   macro avg       0.54      0.54      0.54      4965
weighted avg       0.54      0.54      0.54      4965

Testing Confusion Matrix:
[[1319 1208]
 [1058 1380]]


In [380]:
#KNN model with 11 neighbors
knn5 = KNeighborsClassifier(n_neighbors=11)

# Train the model on the training data
knn5.fit(X_train, y_train)

y_val_pred5 = knn5.predict(X_val)

y_test_pred5 = knn5.predict(X_test)

# Evaluate the model on the validation set
val_accuracy5 = accuracy_score(y_val, y_val_pred5)
print("Validation Accuracy:", val_accuracy5)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred5))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred5))

# Evaluate the model on the testing set
test_accuracy5 = accuracy_score(y_test, y_test_pred5)
print("Testing Accuracy:", test_accuracy5)
print("Testing Classification Report:")
print(classification_report(y_test, y_test_pred5))
print("Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred5))

Validation Accuracy: 0.5583081570996978
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.54      0.55      4909
           1       0.56      0.58      0.57      5021

    accuracy                           0.56      9930
   macro avg       0.56      0.56      0.56      9930
weighted avg       0.56      0.56      0.56      9930

Validation Confusion Matrix:
[[2653 2256]
 [2130 2891]]
Testing Accuracy: 0.5458207452165156
Testing Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.52      0.54      2527
           1       0.53      0.58      0.56      2438

    accuracy                           0.55      4965
   macro avg       0.55      0.55      0.55      4965
weighted avg       0.55      0.55      0.55      4965

Testing Confusion Matrix:
[[1302 1225]
 [1030 1408]]


In [382]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = logreg.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# Make predictions on the testing set
y_test_pred = logreg.predict(X_test)

# Evaluate the model on the testing set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Accuracy:", test_accuracy)
print("Testing Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))




Validation Accuracy: 0.5865055387713998
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.56      0.57      4909
           1       0.59      0.62      0.60      5021

    accuracy                           0.59      9930
   macro avg       0.59      0.59      0.59      9930
weighted avg       0.59      0.59      0.59      9930

Validation Confusion Matrix:
[[2735 2174]
 [1932 3089]]
Testing Accuracy: 0.5756294058408862
Testing Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.55      0.57      2527
           1       0.56      0.60      0.58      2438

    accuracy                           0.58      4965
   macro avg       0.58      0.58      0.58      4965
weighted avg       0.58      0.58      0.58      4965

Testing Confusion Matrix:
[[1402 1125]
 [ 982 1456]]


In [384]:
#Bernoulli Naive Bayes because so many fields are boolean
bnb = BernoulliNB()
bnb.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = bnb.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# Make predictions on the testing set
y_test_pred = bnb.predict(X_test)

# Evaluate the model on the testing set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Accuracy:", test_accuracy)
print("Testing Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))


Validation Accuracy: 0.5559919436052366
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.55      0.55      4909
           1       0.56      0.56      0.56      5021

    accuracy                           0.56      9930
   macro avg       0.56      0.56      0.56      9930
weighted avg       0.56      0.56      0.56      9930

Validation Confusion Matrix:
[[2715 2194]
 [2215 2806]]
Testing Accuracy: 0.5570996978851964
Testing Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.54      0.56      2527
           1       0.55      0.57      0.56      2438

    accuracy                           0.56      4965
   macro avg       0.56      0.56      0.56      4965
weighted avg       0.56      0.56      0.56      4965

Testing Confusion Matrix:
[[1374 1153]
 [1046 1392]]


In [386]:
#Random Forest Classifier because so many fields are boolean
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = rfc.predict(X_val)

# Evaluate the model on the validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:")
print(classification_report(y_val, y_val_pred))
print("Validation Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# Make predictions on the testing set
y_test_pred = rfc.predict(X_test)

# Evaluate the model on the testing set
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Accuracy:", test_accuracy)
print("Testing Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Testing Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Validation Accuracy: 0.5726082578046324
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.58      0.57      4909
           1       0.58      0.57      0.57      5021

    accuracy                           0.57      9930
   macro avg       0.57      0.57      0.57      9930
weighted avg       0.57      0.57      0.57      9930

Validation Confusion Matrix:
[[2824 2085]
 [2159 2862]]
Testing Accuracy: 0.570392749244713
Testing Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.57      0.58      2527
           1       0.56      0.57      0.56      2438

    accuracy                           0.57      4965
   macro avg       0.57      0.57      0.57      4965
weighted avg       0.57      0.57      0.57      4965

Testing Confusion Matrix:
[[1452 1075]
 [1058 1380]]
