In [1]:
# import library
import copy
import os

import joblib
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

# Create Model

Models:
- KNN
- Logistic Regression
- XGBoost

Define params

In [2]:
# create model param function
def create_model_param():
  """Build the hyperparameter search space of every candidate model.

  Returns
  -------
  dict
    Maps an estimator class name to a dict whose keys are hyperparameter
    names and whose values are the lists of candidates to try.
  """
  # one entry per estimator, keyed by the estimator's class name
  return {
    'KNeighborsClassifier' : dict(n_neighbors=[50, 100, 200]),
    'LogisticRegression' : dict(
      penalty=['l1', 'l2'],
      C=[0.01, 0.1],
      max_iter=[100, 300, 500],
    ),
    'XGBClassifier' : dict(n_estimators=[5, 10, 25, 50]),
  }

Define models

In [3]:
# create model object function
def create_model_object():
  """Instantiate the candidate estimators that will be tuned.

  Returns
  -------
  list of dict
    One record per estimator, holding its class name under
    ``'model_name'`` and the unfitted instance under ``'model_object'``.
  """
  print('Creating model objects')

  # the records are returned in this order: KNN, logistic regression, XGBoost
  estimators = (
    KNeighborsClassifier(),
    LogisticRegression(solver='liblinear'),
    XGBClassifier(),
  )

  # the class name doubles as the lookup key for the matching search space
  return [
    {'model_name' : type(estimator).__name__, 'model_object' : estimator}
    for estimator in estimators
  ]

Run the cross-validated hyperparameter search

In [4]:
# train model function
def train_model():
  """Tune every candidate model with a randomized search and persist the results.

  Loads the train and validation splits from ``data/output``, runs a 5-fold
  ``RandomizedSearchCV`` scored by ROC AUC for each model, scores the fitted
  search on both splits, and dumps the search spaces, the base models and
  the tuned results into ``data/model`` (created if it does not exist).

  Returns
  -------
  tuple
    ``(list_of_param, list_of_model, list_of_tuned_model)``, where
    ``list_of_tuned_model`` maps a model name to a dict with the keys
    ``'model'`` (the fitted search), ``'train_auc'``, ``'valid_auc'`` and
    ``'best_params'``.
  """
  # make sure the output folder exists *before* the expensive training, so a
  # missing directory cannot make the final dumps fail and discard the results
  os.makedirs('data/model', exist_ok=True)

  # load dataset
  X_train = joblib.load('data/output/X_train_clean.pkl')
  y_train = joblib.load('data/output/y_train_clean.pkl')
  X_valid = joblib.load('data/output/X_valid_clean.pkl')
  y_valid = joblib.load('data/output/y_valid_clean.pkl')
  
  # create list of params & models
  list_of_param = create_model_param()
  list_of_model = create_model_object()
  
  # tuned results, keyed by model name
  list_of_tuned_model = {}
  
  # train model
  for base_model in list_of_model:
    # current condition
    model_name = base_model['model_name']
    # tune a copy so the base object saved in list_of_model stays unfitted
    model_obj = copy.deepcopy(base_model['model_object'])
    model_param = list_of_param[model_name]
    
    # debug message
    print('Training model :', model_name)
    
    # randomized search: at most 5 sampled parameter settings, 5-fold CV,
    # fixed seed so the sampled settings are reproducible
    model = RandomizedSearchCV(estimator=model_obj,
                               param_distributions=model_param,
                               n_iter=5,
                               cv=5,
                               random_state=123,
                               n_jobs=1,
                               verbose=10,
                               scoring='roc_auc')
    
    # train model
    model.fit(X_train, y_train)
    
    # predicted probability of the second class (column 1)
    # NOTE(review): assumes a binary target whose positive label is classes_[1]
    y_pred_proba_train = model.predict_proba(X_train)[:, 1]
    y_pred_proba_valid = model.predict_proba(X_valid)[:, 1]
    
    # get score
    train_score = roc_auc_score(y_train, y_pred_proba_train)
    valid_score = roc_auc_score(y_valid, y_pred_proba_valid)
    
    # store the fitted search together with its scores and best parameters
    list_of_tuned_model[model_name] = {
      'model': model,
      'train_auc': train_score,
      'valid_auc': valid_score,
      'best_params': model.best_params_
    }
    
    print('Done training')
    print('')
  
  # persist everything so later steps can reload it without retraining
  joblib.dump(list_of_param, 'data/model/list_of_param.pkl')
  joblib.dump(list_of_model, 'data/model/list_of_model.pkl')
  joblib.dump(list_of_tuned_model, 'data/model/list_of_tuned_model.pkl')
  
  return list_of_param, list_of_model, list_of_tuned_model
  
    

In [None]:
# run the tuning pipeline and keep its three return values for inspection
list_of_param, list_of_model, list_of_tuned_model = train_model()

Creating model objects
Training model : KNeighborsClassifier
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START n_neighbors=50..............................................




[CV 1/5; 1/3] END ...............n_neighbors=50;, score=0.858 total time=   0.2s
[CV 2/5; 1/3] START n_neighbors=50..............................................
[CV 2/5; 1/3] END ...............n_neighbors=50;, score=0.853 total time=   0.1s
[CV 3/5; 1/3] START n_neighbors=50..............................................
[CV 3/5; 1/3] END ...............n_neighbors=50;, score=0.846 total time=   0.2s
[CV 4/5; 1/3] START n_neighbors=50..............................................
[CV 4/5; 1/3] END ...............n_neighbors=50;, score=0.851 total time=   0.2s
[CV 5/5; 1/3] START n_neighbors=50..............................................
[CV 5/5; 1/3] END ...............n_neighbors=50;, score=0.856 total time=   0.3s
[CV 1/5; 2/3] START n_neighbors=100.............................................
[CV 1/5; 2/3] END ..............n_neighbors=100;, score=0.858 total time=   0.3s
[CV 2/5; 2/3] START n_neighbors=100.............................................
[CV 2/5; 2/3] END ..........



[CV 1/5; 1/4] END ...............n_estimators=5;, score=0.857 total time=   1.2s
[CV 2/5; 1/4] START n_estimators=5..............................................
[CV 2/5; 1/4] END ...............n_estimators=5;, score=0.851 total time=   0.0s
[CV 3/5; 1/4] START n_estimators=5..............................................
[CV 3/5; 1/4] END ...............n_estimators=5;, score=0.846 total time=   0.0s
[CV 4/5; 1/4] START n_estimators=5..............................................
[CV 4/5; 1/4] END ...............n_estimators=5;, score=0.849 total time=   0.0s
[CV 5/5; 1/4] START n_estimators=5..............................................
[CV 5/5; 1/4] END ...............n_estimators=5;, score=0.850 total time=   0.0s
[CV 1/5; 2/4] START n_estimators=10.............................................
[CV 1/5; 2/4] END ..............n_estimators=10;, score=0.859 total time=   0.0s
[CV 2/5; 2/4] START n_estimators=10.............................................
[CV 2/5; 2/4] END ..........

In [6]:
# display the tuned results: fitted search, train/valid AUC and best params per model
list_of_tuned_model

{'KNeighborsClassifier': {'model': RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_iter=5, n_jobs=1,
                     param_distributions={'n_neighbors': [50, 100, 200]},
                     random_state=123, scoring='roc_auc', verbose=10),
  'train_auc': 0.8602548044426367,
  'valid_auc': 0.8628637399439292,
  'best_params': {'n_neighbors': 100}},
 'LogisticRegression': {'model': RandomizedSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
                     n_iter=5, n_jobs=1,
                     param_distributions={'C': [0.01, 0.1],
                                          'max_iter': [100, 300, 500],
                                          'penalty': ['l1', 'l2']},
                     random_state=123, scoring='roc_auc', verbose=10),
  'train_auc': 0.8526284453004676,
  'valid_auc': 0.8550904112115946,
  'best_params': {'penalty': 'l2', 'max_iter': 500, 'C': 0.01}},
 'XGBClassifier': {'model': RandomizedSearchCV(cv=5,
                     est

Get the best model

In [7]:
# get best model function
def get_best_model():
  """Select the tuned model with the highest validation AUC, save and return it.

  Reads ``data/model/list_of_tuned_model.pkl``, keeps the entry whose
  ``'valid_auc'`` is largest (the first one wins on ties), dumps its fitted
  model to ``data/model/best_model.pkl`` and prints a short summary.

  Returns
  -------
  object
    The ``'model'`` entry of the winning record, or ``None`` when no
    record beats the initial sentinel score.
  """
  # read back the persisted tuning results
  tuned_models = joblib.load('data/model/list_of_tuned_model.pkl')

  # running champion; the sentinel score is the bar the first candidate must clear
  winner_name, winner, winner_params = None, None, None
  top_score = -99999

  for candidate_name in tuned_models:
    candidate = tuned_models[candidate_name]
    # strict comparison: an equal score does not replace the current champion
    if not candidate['valid_auc'] > top_score:
      continue
    winner_name = candidate_name
    winner = candidate['model']
    top_score = candidate['valid_auc']
    winner_params = candidate['best_params']

  # persist the champion for later use
  joblib.dump(winner, 'data/model/best_model.pkl')

  # summary report
  separator = '============================================='
  print(separator)
  print('Best model         :', winner_name)
  print('Metric score       :', top_score)
  print('Best model params  :', winner_params)
  print(separator)

  return winner
  

In [8]:
# pick the model with the highest validation AUC, save it and print a summary
best_model = get_best_model()

Best model         : XGBClassifier
Metric score       : 0.8646928753961602
Best model params  : {'n_estimators': 10}
