In [124]:
from pymongo.mongo_client import MongoClient
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split

from time import time

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

from imblearn.over_sampling import SMOTE

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')
	

### Data Ingestion

In [15]:
uri = "mongodb+srv://root:root@cluster0.k3s4vuf.mongodb.net/?retryWrites=true&w=majority&ssl=true"
client = MongoClient(uri)
collection = client["credit_card_defaults"]["data"]
data = list(collection.find())
df = pd.DataFrame(data)
# Drop ID feature
df = df.drop(columns=['_id'], axis=1)
df.sample(4)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_SEPT,PAY_AUG,PAY_JUL,PAY_JUN,...,BILL_AMT_JUN,BILL_AMT_MAY,BILL_AMT_APR,PAY_AMT_SEPT,PAY_AMT_AUG,PAY_AMT_JUL,PAY_AMT_JUN,PAY_AMT_MAY,PAY_AMT_APR,DEFAULT_PAYMENT
21205,21206,150000,1,2,1,40,0,0,0,0,...,28265,25402,21514,35749,3016,1201,601,2002,1020,0
27477,27478,300000,1,2,1,51,1,2,0,0,...,204116,214691,218101,0,8000,9000,15500,7000,10000,0
20306,20307,30000,2,2,1,56,0,0,0,2,...,13580,13657,16356,1510,3042,0,600,2942,600,1
20627,20628,360000,1,1,2,29,-1,2,-1,-1,...,2868,425,29879,141,2618,2868,425,29879,18780,0


#### View Columns

In [16]:
df.columns

Index(['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_SEPT',
       'PAY_AUG', 'PAY_JUL', 'PAY_JUN', 'PAY_MAY', 'PAY_APR', 'BILL_AMT_SEPT',
       'BILL_AMT_AUG', 'BILL_AMT_JUL', 'BILL_AMT_JUN', 'BILL_AMT_MAY',
       'BILL_AMT_APR', 'PAY_AMT_SEPT', 'PAY_AMT_AUG', 'PAY_AMT_JUL',
       'PAY_AMT_JUN', 'PAY_AMT_MAY', 'PAY_AMT_APR', 'DEFAULT_PAYMENT'],
      dtype='object')

In [4]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_selection import SelectFromModel

# # Drop ID feature
# df = df.drop(columns=['_id'], axis=1)

# X = df.drop(columns=['DEFAULT_PAYMENT'], axis=1)
# y = df['DEFAULT_PAYMENT']
# model = RandomForestClassifier(random_state=42, max_depth=50)
# model.fit(X, y)

# feature = SelectFromModel(model)
# fit = feature.fit_transform(X, y)
# df_fe_selected = pd.DataFrame(fit, columns=feature.get_feature_names_out())

In [5]:
# df_original = df

In [6]:
# df = pd.merge(df_fe_selected, df[['ID', 'DEFAULT_PAYMENT']], on='ID', how='inner')

In [7]:
# df.columns

Index(['ID', 'LIMIT_BAL', 'AGE', 'PAY_SEPT', 'PAY_AUG', 'BILL_AMT_SEPT',
       'BILL_AMT_AUG', 'BILL_AMT_JUL', 'BILL_AMT_JUN', 'BILL_AMT_MAY',
       'BILL_AMT_APR', 'PAY_AMT_SEPT', 'PAY_AMT_AUG', 'PAY_AMT_JUL',
       'PAY_AMT_APR', 'DEFAULT_PAYMENT'],
      dtype='object')

### Split dataset into Train & Test

In [22]:
train_data, test_data = train_test_split(df, test_size=0.33, random_state=42)

In [10]:
# Custom transformer to apply get_dummies to selected columns
class GetDummiesTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.get_dummies(X, columns=self.columns, drop_first=True)


In [18]:
# Update column values
def update_column_values(df):
    # Modify 'EDUCATION' column
    # fil_education = (df['EDUCATION'] == 5) | (df['EDUCATION'] == 6) | (df['EDUCATION'] == 0)
    # df.loc[fil_education, 'EDUCATION'] = 4

    df['EDUCATION'] = df['EDUCATION'].map({0:4, 1:1, 2:2, 3:3, 4:4, 5:4, 6:4})

    # Modify 'MARRIAGE' column
    # fil_marriage = df['MARRIAGE'] == 0
    # df.loc[fil_marriage, 'MARRIAGE'] = 3

    df['MARRIAGE'] = df['MARRIAGE'].map({0:3, 1:1, 2:2, 3:3})

    print("EDUCATION & MARRIAGE column's values are merged which has lesser counts")
    return df

In [19]:
def transform_data():
    numerical_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT_SEPT', 'BILL_AMT_AUG', 'BILL_AMT_JUL', 'BILL_AMT_JUN', 'BILL_AMT_MAY', 'BILL_AMT_APR',
                           'PAY_AMT_SEPT', 'PAY_AMT_AUG', 'PAY_AMT_JUL', 'PAY_AMT_JUN', 'PAY_AMT_MAY', 'PAY_AMT_APR']

    categorical_features = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_SEPT', 'PAY_AUG', 'PAY_JUL', 'PAY_JUN', 'PAY_MAY', 'PAY_APR']

    # numerical_features = ['LIMIT_BAL', 'AGE', 'BILL_AMT_SEPT', 'BILL_AMT_AUG', 'BILL_AMT_JUL', 'BILL_AMT_JUN', 'BILL_AMT_MAY', 'BILL_AMT_APR',
    #                        'PAY_AMT_SEPT', 'PAY_AMT_AUG', 'PAY_AMT_JUL', 'PAY_AMT_APR']

    # categorical_features = ['PAY_SEPT']


    num_pipeline = Pipeline(
        steps=[
            ('scaler', StandardScaler())
        ])

    cat_pipeline = Pipeline(
        steps=[
            # ('onehotencoder', OneHotEncoder(sparse_output=False,handle_unknown='ignore', categories='auto')),
            ('scaler', StandardScaler())
        ]
    )

    preprocessor = ColumnTransformer([
        ('num_pipeline', num_pipeline, numerical_features),
        ('cat_pipeline', cat_pipeline, categorical_features)
    ], remainder='passthrough')

    return preprocessor

In [20]:
# Handle imbalance data
def smote_balance(data):
    
    target_column_name = 'DEFAULT_PAYMENT'
    sm = SMOTE(sampling_strategy='minority', random_state=42)
    
    print('Dataset shape prior resampling: {}'.format(data.shape[0]))
    X_resampled, y_resampled = sm.fit_resample(X=data.drop(columns=target_column_name), y=data[target_column_name])
    data = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled)], axis=1)
    print('Dataset shape after resampling: {}'.format(data.shape[0]))
    return data


In [14]:
def evaluate_models(models: dict, train_features, train_label, test_features, test_label, metric='accuracy'):
    np.random.seed(42)        
    MODEL_REPORT = {}

    def predict(model_name, model, features, label):
        pred_label = model.predict(features)      
        MODEL_REPORT[model_name] = {
        'model': model,
        'accuracy': accuracy_score(y_true=label, y_pred=pred_label),
        'f1': f1_score(y_true=label, y_pred=pred_label),
        'precision': precision_score(y_true=label, y_pred=pred_label),
        'recall': recall_score(y_true=label, y_pred=pred_label),
        'roc-auc': roc_auc_score(y_true=label, y_score=pred_label)}
        return MODEL_REPORT
        
    for model_name, model in models.items():            
        print("\n\n========================= {} =======================".format(model_name))
        start = time()
        model.fit(train_features, train_label)
        end = time()
        print("Model took: {} secs".format(round(end-start, 4)))

        # Evaluate the best model on the train & test set
        print("Predicting Training dataset")
        pred_label = model.predict(train_features)      
        print('model: ', model)
        print('accuracy: ', accuracy_score(y_true=train_label, y_pred=pred_label))
        print('f1: ', f1_score(y_true=train_label, y_pred=pred_label))
        print('precision: ', precision_score(y_true=train_label, y_pred=pred_label))
        print('recall: ', recall_score(y_true=train_label, y_pred=pred_label))
        print('roc-auc: ', roc_auc_score(y_true=train_label, y_score=pred_label))

        print("\nPredicting Test dataset")
        pred_label = model.predict(test_features)      
        print('model: ', model)
        print('accuracy: ', accuracy_score(y_true=test_label, y_pred=pred_label))
        print('f1: ', f1_score(y_true=test_label, y_pred=pred_label))
        print('precision: ', precision_score(y_true=test_label, y_pred=pred_label))
        print('recall: ', recall_score(y_true=test_label, y_pred=pred_label))
        print('roc-auc: ', roc_auc_score(y_true=test_label, y_score=pred_label))


In [15]:
def evaluate_models_with_hyperparameter(models: tuple, train_features, train_label, test_features, test_label, metric='accuracy'):

    def predict(model_name, model, features, label):
        pred_label = model.predict(features)      
        print('model: ', model)
        print('accuracy: ', accuracy_score(y_true=label, y_pred=pred_label))
        print('f1: ', f1_score(y_true=label, y_pred=pred_label))
        print('precision: ', precision_score(y_true=label, y_pred=pred_label))
        print('recall: ', recall_score(y_true=label, y_pred=pred_label))
        print('roc-auc: ', roc_auc_score(y_true=label, y_score=pred_label))
    
    def find_model_by_score(dictionary, target_value):
        for key, value in dictionary.items():
            if value == target_value:
                return key
        return None

    np.random.seed(42)        
    TRAINING_SCORE = {}
    for items in models:
        for model, param in items.items():                
            model_name = str(model).split("()")[0]
            print("\n\n========================= {} =======================".format(model_name))
            start = time()
            cv = GridSearchCV(estimator=model, param_grid=param, cv=3, n_jobs=-1, scoring=metric, verbose=3)
            cv.fit(train_features, train_label)
            end = time()
            print("BEST PARAMS: {}".format(cv.best_params_))
            print("BEST SCORE: {}".format(cv.best_score_))
            print("Model took: {} secs".format(round(end-start, 4)))
            TRAINING_SCORE[cv.best_estimator_] = cv.best_score_

    print("All training scores: {}".format(TRAINING_SCORE))

    best_score = sorted([value for key, value in TRAINING_SCORE.items()], reverse=True)[0]
    best_model = find_model_by_score(TRAINING_SCORE, best_score)
    
    model_name = str(best_model).split("()")[0]
    print("\nPredicting Train dataset")
    predict(model_name=model_name, model=best_model, features=train_features, label=train_label)

    print("\nPredicting Test dataset")
    predict(model_name=model_name, model=best_model, features=test_features, label=test_label)

    # print("BEST MODEL: {}".format(model_name))
    # print("TESTING SCORES: {}".format(MODEL_REPORT[model_name]))

    # return best_model

In [23]:
# train_data = train_data.drop(columns=['_id'], axis=1)
train_data = smote_balance(train_data)

# test_data = test_data.drop(columns=['_id'], axis=1)
test_data = smote_balance(test_data)

Dataset shape prior resampling: 20100
Dataset shape after resampling: 31244
Dataset shape prior resampling: 9900
Dataset shape after resampling: 15484


In [24]:
train_data = update_column_values(train_data)
test_data = update_column_values(test_data)

EDUCATION & MARRIAGE column's values are merged which has lesser counts
EDUCATION & MARRIAGE column's values are merged which has lesser counts


In [25]:
train_X_data = train_data.drop(columns=['ID', 'DEFAULT_PAYMENT'])
train_y_data = train_data['DEFAULT_PAYMENT']

test_X_data = test_data.drop(columns=['ID', 'DEFAULT_PAYMENT'])
test_y_data = test_data['DEFAULT_PAYMENT']

In [26]:
preprocessor = transform_data()
preprocessor.fit(train_X_data)

train_X_data_arr = preprocessor.transform(train_X_data)
test_X_data_arr = preprocessor.transform(test_X_data)

train_X_data_arr = pd.DataFrame(train_X_data_arr, columns=preprocessor.get_feature_names_out())
test_X_data_arr = pd.DataFrame(test_X_data_arr, columns=preprocessor.get_feature_names_out())

train_df = pd.concat([train_X_data_arr, train_y_data], axis=1)
test_df = pd.concat([test_X_data_arr, test_y_data], axis=1)

In [27]:
X_train, y_train, X_test, y_test = (train_df.iloc[:, :-1], train_df.iloc[:, -1], test_df.iloc[:, :-1], test_df.iloc[:, -1])

In [21]:
models = {
            'DecisionTree': DecisionTreeClassifier(),
            'SVM': SVC(),
            'LogisticRegression': LogisticRegression(),
            'RandomForest': RandomForestClassifier(),
            'NearestNeighbors': KNeighborsClassifier(),
            'GradientBoosting': GradientBoostingClassifier(),
            'AdaBoost': AdaBoostClassifier(),
            'NaiveBayes': GaussianNB()
            }

In [22]:
# evaluate_models(models, X_train, y_train, X_test, y_test, metric="accuracy")


In [23]:
# hyper_parameter_models = (
#                 {
#                     LogisticRegression(max_iter=1000): {'penalty':['l1','l2'], 'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000] }},
#                 {
#                     SVC(): {'C': [10, 100], 'kernel': ['rbf', 'poly']} },
#                 {
#                     RandomForestClassifier(): {'n_estimators': [100,200,500,1000], 'max_depth': [10,30, 50,90]}},
#                 {
#                     GradientBoostingClassifier(): { 
#                         'n_estimators': [100, 500, 1000, 5000],
#                         'max_depth': [5,10,20], 
#                         'min_samples_split': [100, 500, 2000],
#                         'min_samples_leaf': [30, 50, 70],
#                         'max_features': [5, 10, 40]
#                         }},
#                 {
#                     KNeighborsClassifier(): { 
#                         'n_neighbors': [2, 5, 7, 9, 11, 13, 15, 30, 60],
#                         'weights': ['uniform', 'distance'],
#                         'metric': ['minkowski', 'euclidean', 'manhattan'],
#                         "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
#                         }},
#                 {
#                     DecisionTreeClassifier(): {'max_depth': [20,30,50,100], 'min_samples_split':[0.1,0.2,0.4]}}
#                 )

In [24]:
# evaluate_models_with_hyperparameter(hyper_parameter_models, X_train, y_train, X_test, y_test, metric="accuracy")

In [32]:
hyper_parameter_models = (
               {
                    GradientBoostingClassifier(): { 
                        'n_estimators': [100],
                        'max_depth': [50]
                        }},)

In [33]:
evaluate_models_with_hyperparameter(hyper_parameter_models, X_train, y_train, X_test, y_test, metric="accuracy")



Fitting 3 folds for each of 1 candidates, totalling 3 fits
BEST PARAMS: {'max_depth': 50, 'n_estimators': 100}
BEST SCORE: 0.7438693133430307
Model took: 408.9475 secs
All training scores: {GradientBoostingClassifier(max_depth=50): 0.7438693133430307}

Predicting Train dataset
model:  GradientBoostingClassifier(max_depth=50)
accuracy:  0.9983991767194558
f1:  0.998398260969052
precision:  0.998969721252361
recall:  0.9978274541192613
roc-auc:  0.9983991767194557

Predicting Test dataset
model:  GradientBoostingClassifier(max_depth=50)
accuracy:  0.6918951132300357
f1:  0.6719840478564307
precision:  0.7184108527131783
recall:  0.6311935978205346
roc-auc:  0.6918951132300358


In [42]:
def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print("\n Time taken: {} hours {} minutes and {} seconds".format(thour, tmin, round(tsec, 2)))

### XGBoostClassifier

In [128]:
params = {
    'learning_rate'         : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    'max-depth'             : [3, 4, 5, 6, 8, 10, 12, 15],
    'min_child_weight'      : [1, 3, 5, 7],
    'gamma'                 : [0.0, 0.1, 0.2, 0.3, 0.4],
    'colsample_bytree'      : [0.3, 0.4, 0.5, 0.7], 
}

random_search = RandomizedSearchCV(estimator=XGBClassifier(), param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=5)

start_time = timer(None)
random_search.fit(X_train, y_train)
timer(start_time)

print(random_search.best_estimator_)
print(random_search.best_score_)

print("Accuracy score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='accuracy').mean())
print("Precision score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='precision').mean())
print("Recall score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='recall').mean())
print("F1 Score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='f1').mean())
print("ROC-AUC score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='roc_auc').mean())

Fitting 5 folds for each of 5 candidates, totalling 25 fits

 Time taken: 0.0 hours 0.0 minutes and 1.99 seconds
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.7, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=0.0, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.3, max-depth=4,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=5, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, ...)
0.8980945050421256
Accuracy score:  0.8275217569724903
Precision score:  0.8377645782080642
Recall score:  0.8041958823039094
F1 Score:  0.81

### GradientBoostingClassifier

In [122]:
params = { 
                        'n_estimators': [5,10, 15],
                        'max_depth': [5,10,20], 
                        'min_samples_split': [10, 20, 30],
                        'min_samples_leaf': [10, 20, 30],
                        'max_features': [5, 10, 40]
                        }

random_search = RandomizedSearchCV(estimator=GradientBoostingClassifier(), param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=5)

start_time = timer(None)
random_search.fit(X_train, y_train)
timer(start_time)

print(random_search.best_estimator_)
print("Best Score: ", random_search.best_score_)

print("Accuracy score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='accuracy').mean())
print("Precision score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='precision').mean())
print("Recall score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='recall').mean())
print("F1 Score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='f1').mean())
print("ROC-AUC score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='roc_auc').mean())

Fitting 5 folds for each of 5 candidates, totalling 25 fits

 Time taken: 0.0 hours 0.0 minutes and 52.29 seconds
GradientBoostingClassifier(max_depth=20, max_features=40, min_samples_leaf=10,
                           min_samples_split=10, n_estimators=15)
0.8873018702736732
Accuracy score:  0.8171822948205311
Precision score:  0.8232198406243618
Recall score:  0.8035450529298991
F1 Score:  0.8066304492067266
ROC-AUC score:  0.8971379940161611


### KNeighbourClassifier

In [126]:
params = { 
                        'n_neighbors': [2, 5, 15, 30],
                        'weights': ['uniform', 'distance'],
                        'metric': ['minkowski', 'euclidean', 'manhattan'],
                        "algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
                        }

random_search = RandomizedSearchCV(estimator=KNeighborsClassifier(), param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=5)

start_time = timer(None)
random_search.fit(X_train, y_train)
timer(start_time)

print(random_search.best_estimator_)
print(random_search.best_score_)

print("Accuracy score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='accuracy').mean())
print("Precision score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='precision').mean())
print("Recall score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='recall').mean())
print("F1 Score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='f1').mean())
print("ROC-AUC score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='roc_auc').mean())

Fitting 5 folds for each of 5 candidates, totalling 25 fits

 Time taken: 0.0 hours 0.0 minutes and 25.14 seconds
KNeighborsClassifier(algorithm='kd_tree', metric='manhattan',
                     weights='distance')
0.8715987083370905
Accuracy score:  0.7986407835012336
Precision score:  0.7627990114998968
Recall score:  0.864508127031758
F1 Score:  0.8089565836848955
ROC-AUC score:  0.8836744040661328


### RandomForestClassifier

In [127]:
params = {
    'n_estimators': [5,10, 15, 30], 
    'max_depth': [5,20,50,100]}

random_search = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=5)

start_time = timer(None)
random_search.fit(X_train, y_train)
timer(start_time)

print(random_search.best_estimator_)
print("Best Score: ", random_search.best_score_)

print("Accuracy score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='accuracy').mean())
print("Precision score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='precision').mean())
print("Recall score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='recall').mean())
print("F1 Score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='f1').mean())
print("ROC-AUC score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='roc_auc').mean())

Fitting 5 folds for each of 5 candidates, totalling 25 fits

 Time taken: 0.0 hours 0.0 minutes and 13.23 seconds
RandomForestClassifier(max_depth=20, n_estimators=30)
Best Score:  0.9073772375873211
Accuracy score:  0.8370729719765857
Precision score:  0.8408436109234966
Recall score:  0.8177534383595898
F1 Score:  0.8301652461256779
ROC-AUC score:  0.914946562545029


### SVC

In [131]:
params = {
    'C': [2, 5, 10], 
    'kernel': ['rbf', 'poly']} 

random_search = RandomizedSearchCV(estimator=SVC(), param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3, random_state=5)

start_time = timer(None)
random_search.fit(X_train, y_train)
timer(start_time)

print(random_search.best_estimator_)
print("Best Score: ", random_search.best_score_)

print("Accuracy score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='accuracy').mean())
print("Precision score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='precision').mean())
print("Recall score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='recall').mean())
print("F1 Score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='f1').mean())
print("ROC-AUC score: ", cross_val_score(random_search.best_estimator_, X_test, y_test, cv=10, n_jobs=-1, scoring='roc_auc').mean())

Fitting 5 folds for each of 5 candidates, totalling 25 fits

 Time taken: 0.0 hours 3.0 minutes and 3.82 seconds
SVC(C=10)
Best Score:  0.8474395341603806
Accuracy score:  0.7831445393627297
Precision score:  0.7900689902203775
Recall score:  0.765827456864216
F1 Score:  0.7745447183529446
ROC-AUC score:  0.8566740483570505
