In [7]:
from pymongo.mongo_client import MongoClient
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from time import time

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
	

### Data Ingestion

In [8]:
uri = "mongodb+srv://root:root@cluster0.k3s4vuf.mongodb.net/?retryWrites=true&w=majority&ssl=true"
client = MongoClient(uri)
collection = client["credit_card_defaults"]["data"]
data = list(collection.find())
df = pd.DataFrame(data)
df.sample(4)

#### View Columns

In [None]:
df.columns

Index(['_id', 'ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
       'PAY_SEPT', 'PAY_AUG', 'PAY_JUL', 'PAY_JUN', 'PAY_MAY', 'PAY_APR',
       'BILL_AMT_SEPT', 'BILL_AMT_AUG', 'BILL_AMT_JUL', 'BILL_AMT_JUN',
       'BILL_AMT_MAY', 'BILL_AMT_APR', 'PAY_AMT_SEPT', 'PAY_AMT_AUG',
       'PAY_AMT_JUL', 'PAY_AMT_JUN', 'PAY_AMT_MAY', 'PAY_AMT_APR',
       'DEFAULT_PAYMENT'],
      dtype='object')

### Split dataset into Train & Test

In [None]:
train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)

In [None]:
# Custom transformer to apply get_dummies to selected columns
class GetDummiesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.get_dummies(X, columns=self.columns, drop_first=True)


In [None]:
# Update column values
def update_column_values(df):
    # Modify 'EDUCATION' column
    fil_education = (df['EDUCATION'] == 5) | (df['EDUCATION'] == 6) | (df['EDUCATION'] == 0)
    df.loc[fil_education, 'EDUCATION'] = 4

    # Modify 'MARRIAGE' column
    fil_marriage = df['MARRIAGE'] == 0
    df.loc[fil_marriage, 'MARRIAGE'] = 3

    print("EDUCATION & MARRIAGE column's values are merged which has lesser counts")
    return df

In [None]:
def transform_data():
    numerical_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT_SEPT', 'BILL_AMT_AUG', 'BILL_AMT_JUL', 'BILL_AMT_JUN', 'BILL_AMT_MAY', 'BILL_AMT_APR',
                           'PAY_AMT_SEPT', 'PAY_AMT_AUG', 'PAY_AMT_JUL', 'PAY_AMT_JUN', 'PAY_AMT_MAY', 'PAY_AMT_APR']

    categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_SEPT', 'PAY_AUG', 'PAY_JUL', 'PAY_JUN', 'PAY_MAY', 'PAY_APR']


    num_pipeline = Pipeline(
        steps=[
            ('scaler', StandardScaler())
        ])

    cat_pipeline = Pipeline(
        steps=[
            # ('onehotencoder', OneHotEncoder(sparse_output=False,handle_unknown='ignore', categories='auto')),
            ('scaler', StandardScaler())
        ]
    )

    preprocessor = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_features),
        ('cat_pipeline', cat_pipeline, categorical_features)
    ], remainder='passthrough')

    return preprocessor

In [None]:
# Handle imbalance data
def smote_balance(data):
    
    target_column_name = 'DEFAULT_PAYMENT'
    sm = SMOTE(sampling_strategy='minority', random_state=42)
    
    print('Dataset shape prior resampling: {}'.format(data.shape[0]))
    X_resampled, y_resampled = sm.fit_resample(X=data.drop(columns=target_column_name), y=data[target_column_name])
    data = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)
    print('Dataset shape after resampling: {}'.format(data.shape[0]))
    return data


In [None]:
def evaluate_models(models: dict, train_features, train_label, test_features, test_label, metric='accuracy'):
    np.random.seed(42)        
    MODEL_REPORT = {}

    def predict(model_name, model, features, label):
        pred_label = model.predict(features)      
        MODEL_REPORT[model_name] = {
        'model': model,
        'accuracy': accuracy_score(y_true=label, y_pred=pred_label),
        'f1': f1_score(y_true=label, y_pred=pred_label),
        'precision': precision_score(y_true=label, y_pred=pred_label),
        'recall': recall_score(y_true=label, y_pred=pred_label),
        'roc-auc': roc_auc_score(y_true=label, y_score=pred_label)}
        return MODEL_REPORT
        
    for model_name, model in models.items():            
        print("\n\n========================= {} =======================".format(model_name))
        start = time()
        model.fit(train_features, train_label)
        end = time()
        print("Model took: {} secs".format(round(end-start, 4)))

        # Evaluate the best model on the train & test set
        print("Predicting Training dataset")
        pred_label = model.predict(train_features)      
        print('model: ', model)
        print('accuracy: ', accuracy_score(y_true=train_label, y_pred=pred_label))
        print('f1: ', f1_score(y_true=train_label, y_pred=pred_label))
        print('precision: ', precision_score(y_true=train_label, y_pred=pred_label))
        print('recall: ', recall_score(y_true=train_label, y_pred=pred_label))
        print('roc-auc: ', roc_auc_score(y_true=train_label, y_score=pred_label))

        print("\nPredicting Test dataset")
        pred_label = model.predict(test_features)      
        print('model: ', model)
        print('accuracy: ', accuracy_score(y_true=test_label, y_pred=pred_label))
        print('f1: ', f1_score(y_true=test_label, y_pred=pred_label))
        print('precision: ', precision_score(y_true=test_label, y_pred=pred_label))
        print('recall: ', recall_score(y_true=test_label, y_pred=pred_label))
        print('roc-auc: ', roc_auc_score(y_true=test_label, y_score=pred_label))


In [None]:
def evaluate_models_with_hyperparameter(models: tuple, train_features, train_label, test_features, test_label, metric='accuracy'):

    def predict(model_name, model, features, label):
        pred_label = model.predict(features)      
        print('model: ', model)
        print('accuracy: ', accuracy_score(y_true=label, y_pred=pred_label))
        print('f1: ', f1_score(y_true=label, y_pred=pred_label))
        print('precision: ', precision_score(y_true=label, y_pred=pred_label))
        print('recall: ', recall_score(y_true=label, y_pred=pred_label))
        print('roc-auc: ', roc_auc_score(y_true=label, y_score=pred_label))
    
    def find_model_by_score(dictionary, target_value):
        for key, value in dictionary.items():
            if value == target_value:
                return key
        return None

    np.random.seed(42)        
    TRAINING_SCORE = {}
    for items in models:
        for model, param in items.items():                
            model_name = str(model).split("()")[0]
            print("\n\n========================= {} =======================".format(model_name))
            start = time()
            cv = GridSearchCV(estimator=model, param_grid=param, cv=3, n_jobs=-1, scoring=metric)
            cv.fit(train_features, train_label)
            end = time()
            print("BEST PARAMS: {}".format(cv.best_params_))
            print("BEST SCORE: {}".format(cv.best_score_))
            print("Model took: {} secs".format(round(end-start, 4)))
            TRAINING_SCORE[cv.best_estimator_] = cv.best_score_

    print("All training scores: {}".format(TRAINING_SCORE))

    best_score = sorted([value for key, value in TRAINING_SCORE.items()], reverse=True)[0]
    best_model = find_model_by_score(TRAINING_SCORE, best_score)
    
    model_name = str(best_model).split("()")[0]
    print("\nPredicting Train dataset")
    predict(model_name=model_name, model=best_model, features=train_features, label=train_label)

    print("\nPredicting Test dataset")
    predict(model_name=model_name, model=best_model, features=test_features, label=test_label)

    # print("BEST MODEL: {}".format(model_name))
    # print("TESTING SCORES: {}".format(MODEL_REPORT[model_name]))

    # return best_model

In [None]:
train_data = train_data.drop(columns=['_id'], axis=1)
train_data = smote_balance(train_data)

test_data = test_data.drop(columns=['_id'], axis=1)
test_data = smote_balance(test_data)

Dataset shape prior resampling: 22500
Dataset shape after resampling: 34982
Dataset shape prior resampling: 7500
Dataset shape after resampling: 11746


In [None]:
train_data = update_column_values(train_data)
test_data = update_column_values(test_data)

EDUCATION & MARRIAGE column's values are merged which has lesser counts
EDUCATION & MARRIAGE column's values are merged which has lesser counts


In [None]:
train_X_data = train_data.drop(columns=['ID', 'DEFAULT_PAYMENT'])
train_y_data = train_data['DEFAULT_PAYMENT']

test_X_data = test_data.drop(columns=['ID', 'DEFAULT_PAYMENT'])
test_y_data = test_data['DEFAULT_PAYMENT']

In [None]:
preprocessor = transform_data()
preprocessor.fit(train_X_data)

train_X_data_arr = preprocessor.transform(train_X_data)
test_X_data_arr = preprocessor.transform(test_X_data)

train_X_data_arr = pd.DataFrame(train_X_data_arr, columns=preprocessor.get_feature_names_out())
test_X_data_arr = pd.DataFrame(test_X_data_arr, columns=preprocessor.get_feature_names_out())

train_df = pd.concat([train_X_data_arr, train_y_data], axis=1)
test_df = pd.concat([test_X_data_arr, test_y_data], axis=1)

In [None]:
X_train, y_train, X_test, y_test = (train_df.iloc[:, :-1], train_df.iloc[:, -1], test_df.iloc[:, :-1], test_df.iloc[:, -1])

In [None]:
models = {
            'DecisionTree': DecisionTreeClassifier(),
            'SVM': SVC(),
            'LogisticRegression': LogisticRegression(),
            'RandomForest': RandomForestClassifier(),
            'NearestNeighbors': KNeighborsClassifier(),
            'GradientBoosting': GradientBoostingClassifier(),
            'AdaBoost': AdaBoostClassifier(),
            'NaiveBayes': GaussianNB()
            }

In [None]:
# evaluate_models(models, X_train, y_train, X_test, y_test, metric="accuracy")


In [None]:
hyper_parameter_models = (
                {
                    GaussianNB(): {'var_smoothing': np.logspace(0,-9, num=100)}},
                {
                    LogisticRegression(max_iter=1000): {'penalty':['l1','l2'], 'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000] }},
                {
                    SVC(): {'C': [0.1, 1, 10, 100, 500], 'kernel': ['rbf', 'poly']} },
                {   
                    AdaBoostClassifier(): {'n_estimators': [100, 500, 1000, 5000]}},
                {
                    RandomForestClassifier(): {'n_estimators': [100,150,200, 500], 'max_depth': [10,20,30, 50]}},
                {
                    GradientBoostingClassifier(): { 
                        'n_estimators': [100, 500, 1000, 5000],
                        'max_depth': [5,10,20], 
                        'min_samples_split': [100, 500, 2000],
                        'min_samples_leaf': [30, 50, 70],
                        'max_features': [5, 10, 40],
                        'subsample': [0.6,0.7,0.75,0.8,0.85,0.9]
                        }},
                {
                    KNeighborsClassifier(): { 
                        'n_neighbors': [2, 5, 7, 9, 11, 13, 15, 30, 60],
                        'weights': ['uniform', 'distance'],
                        'metric': ['minkowski', 'euclidean', 'manhattan'],
                        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
                        }},
                {
                    DecisionTreeClassifier(): {'max_depth': [20,30,50,100], 'min_samples_split':[0.1,0.2,0.4]}}
                )

In [None]:
evaluate_models_with_hyperparameter(hyper_parameter_models, X_train, y_train, X_test, y_test, metric="accuracy")



BEST PARAMS: {'var_smoothing': 0.533669923120631}
BEST SCORE: 0.6048545044348667
Model took: 4.0418 secs


BEST PARAMS: {'C': 0.001, 'penalty': 'l2'}
BEST SCORE: 0.7177411875966807
Model took: 0.5246 secs


BEST PARAMS: {'C': 100, 'kernel': 'rbf'}
BEST SCORE: 0.7607639245898854
Model took: 1849.0344 secs


BEST PARAMS: {'n_estimators': 5000}
BEST SCORE: 0.7503020776227061
Model took: 644.0108 secs


BEST PARAMS: {'max_depth': 30, 'n_estimators': 200}
BEST SCORE: 0.8184509295350463
Model took: 185.6947 secs


