In [1]:
pip install numpy pandas matplotlib seaborn scikit-learn imbalanced-learn xgboost lightgbm catboost

Note: you may need to restart the kernel to use updated packages.


In [150]:
# Numerical and Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Handling Imbalanced Datasets
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Gradient Boosting Algorithms
import xgboost as xgb
import lightgbm as lgb
import catboost as cb



In [152]:
file_path = '/Users/liamr/Downloads/application_data.csv'

data = pd.read_csv(file_path)

print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR
dtypes: float64(65), int64(41), object(16)
memory usage: 286.2+ MB
None


In [154]:
# Print columns with missing values
missing_columns = data.columns[data.isnull().any()]
print(data[missing_columns].isnull().sum())

AMT_ANNUITY                       12
AMT_GOODS_PRICE                  278
NAME_TYPE_SUITE                 1292
OWN_CAR_AGE                   202929
OCCUPATION_TYPE                96391
                               ...  
AMT_REQ_CREDIT_BUREAU_DAY      41519
AMT_REQ_CREDIT_BUREAU_WEEK     41519
AMT_REQ_CREDIT_BUREAU_MON      41519
AMT_REQ_CREDIT_BUREAU_QRT      41519
AMT_REQ_CREDIT_BUREAU_YEAR     41519
Length: 67, dtype: int64


In [156]:
df_cleaned = data.dropna(axis=1)
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 55 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   SK_ID_CURR                   307511 non-null  int64  
 1   TARGET                       307511 non-null  int64  
 2   NAME_CONTRACT_TYPE           307511 non-null  object 
 3   CODE_GENDER                  307511 non-null  object 
 4   FLAG_OWN_CAR                 307511 non-null  object 
 5   FLAG_OWN_REALTY              307511 non-null  object 
 6   CNT_CHILDREN                 307511 non-null  int64  
 7   AMT_INCOME_TOTAL             307511 non-null  float64
 8   AMT_CREDIT                   307511 non-null  float64
 9   NAME_INCOME_TYPE             307511 non-null  object 
 10  NAME_EDUCATION_TYPE          307511 non-null  object 
 11  NAME_FAMILY_STATUS           307511 non-null  object 
 12  NAME_HOUSING_TYPE            307511 non-null  object 
 13 

In [184]:
# Drop all columns with dtype 'object'
df_cleaned = df_cleaned.drop(df_cleaned.select_dtypes(include=['object']).columns, axis=1)
X = df_cleaned.drop(columns=['TARGET'])
y = df_cleaned['TARGET']
print("Column Names:", X.columns.tolist())
X.head()

Column Names: ['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']


Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,FLAG_MOBIL,...,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
0,100002,0,202500.0,406597.5,0.018801,-9461,-637,-3648.0,-2120,1,...,0,0,0,0,0,0,0,0,0,0
1,100003,0,270000.0,1293502.5,0.003541,-16765,-1188,-1186.0,-291,1,...,0,0,0,0,0,0,0,0,0,0
2,100004,0,67500.0,135000.0,0.010032,-19046,-225,-4260.0,-2531,1,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,135000.0,312682.5,0.008019,-19005,-3039,-9833.0,-2437,1,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,121500.0,513000.0,0.028663,-19932,-3038,-4311.0,-3458,1,...,0,0,0,0,0,0,0,0,0,0


In [180]:
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

scaled_X


array([[-1.73342255, -0.57753784,  0.14212925, ..., -0.02440195,
        -0.02252901, -0.01830463],
       [-1.73341283, -0.57753784,  0.42679193, ..., -0.02440195,
        -0.02252901, -0.01830463],
       [-1.7334031 , -0.57753784, -0.4271961 , ..., -0.02440195,
        -0.02252901, -0.01830463],
       ...,
       [ 1.73239096, -0.57753784, -0.06662338, ..., -0.02440195,
        -0.02252901, -0.01830463],
       [ 1.73240069, -0.57753784,  0.00928667, ..., -0.02440195,
        -0.02252901, -0.01830463],
       [ 1.73241042, -0.57753784, -0.04764587, ..., -0.02440195,
        -0.02252901, -0.01830463]])

In [198]:
#Perform PCA
pca = PCA(n_components=35)  # Retain 2 components
principal_components = pca.fit_transform(scaled_X)

#Explained variance
explained_variance = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance)
print("Total Explained Variance:", explained_variance.sum())

#Explained variance too low even with more than half of components included, no point, model will have to include everything

Explained Variance Ratio: [0.0883182  0.06221418 0.0483697  0.03487221 0.03199498 0.03051506
 0.02913053 0.0272238  0.02515427 0.02433393 0.0239648  0.0232382
 0.02313144 0.02294731 0.02289425 0.02280275 0.02278474 0.0227467
 0.02273884 0.0227312  0.02272182 0.02272097 0.02264804 0.02258417
 0.02253511 0.02240218 0.0222241  0.02179739 0.02136473 0.02036484
 0.01975386 0.01935736 0.01764461 0.01711713 0.01467629]
Total Explained Variance: 0.942019703735571


In [200]:
# First split: Training (70%) and temporary (30%)
X_train, X_temp, y_train, y_temp = train_test_split(scaled_X, y, test_size=0.3, random_state=42, stratify=y)

# Second split: Validation (20%) and Testing (10%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42, stratify=y_temp)
#stratify to ensure equal splits between fraudulent and non fraudulent purchases

# Check the sizes of each split
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Testing set: {X_test.shape}, {y_test.shape}")


Training set: (215257, 44), (215257,)
Validation set: (61810, 44), (61810,)
Testing set: (30444, 44), (30444,)


In [202]:
#KNN model

In [206]:
# Initialize the KNN model with 5 neighbors (default)
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the training data
knn.fit(X_train, y_train)
