In [1]:
import pandas as pd
import numpy as np
import copy

import src.utils as utils

# Load Config File

In [2]:
CONFIG_DATA = utils.config_load()
CONFIG_DATA

{'raw_dataset_path': 'data/raw/data.csv',
 'data_set_path': 'data/output/data.pkl',
 'input_set_path': 'data/output/input.pkl',
 'output_set_path': 'data/output/output.pkl',
 'input_columns_path': 'data/output/input_columns.pkl',
 'train_set_path': ['data/output/X_train.pkl', 'data/output/y_train.pkl'],
 'valid_set_path': ['data/output/X_valid.pkl', 'data/output/y_valid.pkl'],
 'test_set_path': ['data/output/X_test.pkl', 'data/output/y_test.pkl'],
 'index_column': 'Unnamed: 0',
 'output_column': 'SeriousDlqin2yrs',
 'seed': 42,
 'test_size': 0.2,
 'clean_late_col': 'NumberOfTimes90DaysLate',
 'clean_late_val': 96,
 'clean_unsecure_col': 'RevolvingUtilizationOfUnsecuredLines',
 'constant_imputer_col': 'NumberOfDependents',
 'constant_imputer_path': 'data/output/constant_imputer.pkl',
 'constant_imputer_val': 0.0,
 'median_imputer_col': 'MonthlyIncome',
 'median_imputer_path': 'data/output/median_imputer.pkl',
 'standardizer_path': 'data/output/standardizer.pkl',
 'preprocessor_path': 'd

# Create Model

Model of Choice
- KNN
- Decision Tree
- Logistic Regression
- Random Forest
- XGBoost

Define params

In [3]:
def create_model_param():
    """Create the model objects"""
    lgr_params = {
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1],
        'max_iter': [100, 300, 500]
    }

    xgb_params = {
        'n_estimators': [50, 100, 200]
    }

    # Create model params
    list_of_param = {
        'LogisticRegression': lgr_params,
        'XGBClassifier': xgb_params
    }

    return list_of_param


Define Models

In [4]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [5]:
def create_model_object():
    """Create the model objects"""
    print("Creating model objects")

    # Create model objects
    lgr = LogisticRegression()
    xgb = XGBClassifier()

    # Create list of model
    list_of_model = [
        {'model_name': lgr.__class__.__name__, 'model_object': lgr},
        {'model_name': xgb.__class__.__name__, 'model_object': xgb}
    ]

    return list_of_model


Do the cross validation

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [7]:
def train_model(return_file=True):
    """Function to get the best model"""
    # Load dataset
    X_train = utils.pickle_load(CONFIG_DATA['train_clean_path'][0])
    y_train = utils.pickle_load(CONFIG_DATA['train_clean_path'][1])
    X_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][0])
    y_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][1])
    
    # Create list of params & models
    list_of_param = create_model_param()
    list_of_model = create_model_object()

    # List of trained model
    list_of_tuned_model = {}

    # Train model
    for base_model in list_of_model:
        # Current condition
        model_name = base_model['model_name']
        model_obj = copy.deepcopy(base_model['model_object'])
        model_param = list_of_param[model_name]

        # Debug message
        print('Training model :', model_name)

        # Create model object
        model = RandomizedSearchCV(estimator = model_obj,
                                   param_distributions = model_param,
                                   n_iter=5,
                                   cv = 5,
                                   random_state = 123,
                                   n_jobs=1,
                                   verbose=10,
                                   scoring = 'roc_auc')
        
        # Train model
        model.fit(X_train, y_train)

        # Predict
        y_pred_proba_train = model.predict_proba(X_train)[:, 1]
        y_pred_proba_valid = model.predict_proba(X_valid)[:, 1]
        
        # Get score
        train_score = roc_auc_score(y_train, y_pred_proba_train)
        valid_score = roc_auc_score(y_valid, y_pred_proba_valid)

        # Append
        list_of_tuned_model[model_name] = {
            'model': model,
            'train_auc': train_score,
            'valid_auc': valid_score,
            'best_params': model.best_params_
        }

        print("Done training")
        print("")

    # Dump data
    utils.pickle_dump(list_of_param, CONFIG_DATA['list_of_param_path'])
    utils.pickle_dump(list_of_model, CONFIG_DATA['list_of_model_path'])
    utils.pickle_dump(list_of_tuned_model, CONFIG_DATA['list_of_tuned_model_path'])

    if return_file:
        return list_of_param, list_of_model, list_of_tuned_model    


In [8]:
list_of_param, list_of_model, list_of_tuned_model = train_model()

Creating model objects
Training model : LogisticRegression
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5; 1/5] START C=0.01, max_iter=500, penalty=l2............................
[CV 1/5; 1/5] END C=0.01, max_iter=500, penalty=l2;, score=0.857 total time=   0.1s
[CV 2/5; 1/5] START C=0.01, max_iter=500, penalty=l2............................
[CV 2/5; 1/5] END C=0.01, max_iter=500, penalty=l2;, score=0.854 total time=   0.0s
[CV 3/5; 1/5] START C=0.01, max_iter=500, penalty=l2............................
[CV 3/5; 1/5] END C=0.01, max_iter=500, penalty=l2;, score=0.847 total time=   0.0s
[CV 4/5; 1/5] START C=0.01, max_iter=500, penalty=l2............................
[CV 4/5; 1/5] END C=0.01, max_iter=500, penalty=l2;, score=0.849 total time=   0.0s
[CV 5/5; 1/5] START C=0.01, max_iter=500, penalty=l2............................


[CV 5/5; 1/5] END C=0.01, max_iter=500, penalty=l2;, score=0.854 total time=   0.0s
[CV 1/5; 2/5] START C=0.01, max_iter=100, penalty=l1............................
[CV 1/5; 2/5] END C=0.01, max_iter=100, penalty=l1;, score=nan total time=   0.0s
[CV 2/5; 2/5] START C=0.01, max_iter=100, penalty=l1............................
[CV 2/5; 2/5] END C=0.01, max_iter=100, penalty=l1;, score=nan total time=   0.0s
[CV 3/5; 2/5] START C=0.01, max_iter=100, penalty=l1............................
[CV 3/5; 2/5] END C=0.01, max_iter=100, penalty=l1;, score=nan total time=   0.0s
[CV 4/5; 2/5] START C=0.01, max_iter=100, penalty=l1............................
[CV 4/5; 2/5] END C=0.01, max_iter=100, penalty=l1;, score=nan total time=   0.0s
[CV 5/5; 2/5] START C=0.01, max_iter=100, penalty=l1............................
[CV 5/5; 2/5] END C=0.01, max_iter=100, penalty=l1;, score=nan total time=   0.0s
[CV 1/5; 3/5] START C=0.01, max_iter=500, penalty=l1............................
[CV 1/5; 3/5] END C=

15 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/home/cahya_pacmann/anaconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/cahya_pacmann/anaconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/cahya_pacmann/anaconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties

Done training

Training model : XGBClassifier
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5; 1/3] START n_estimators=50.............................................
[CV 1/5; 1/3] END ..............n_estimators=50;, score=0.855 total time=   0.4s
[CV 2/5; 1/3] START n_estimators=50.............................................
[CV 2/5; 1/3] END ..............n_estimators=50;, score=0.845 total time=   0.4s
[CV 3/5; 1/3] START n_estimators=50.............................................
[CV 3/5; 1/3] END ..............n_estimators=50;, score=0.845 total time=   0.3s
[CV 4/5; 1/3] START n_estimators=50.............................................
[CV 4/5; 1/3] END ..............n_estimators=50;, score=0.845 total time=   0.3s
[CV 5/5; 1/3] START n_estimators=50.............................................
[CV 5/5; 1/3] END ..............n_estimators=50;, score=0.850 total time=   0.4s
[CV 1/5; 2/3] START n_estimators=100............................................
[CV

In [9]:
list_of_tuned_model

{'LogisticRegression': {'model': RandomizedSearchCV(cv=5, estimator=LogisticRegression(), n_iter=5, n_jobs=1,
                     param_distributions={'C': [0.01, 0.1],
                                          'max_iter': [100, 300, 500],
                                          'penalty': ['l1', 'l2']},
                     random_state=123, scoring='roc_auc', verbose=10),
  'train_auc': 0.8525712625980069,
  'valid_auc': 0.8551535886466674,
  'best_params': {'penalty': 'l2', 'max_iter': 300, 'C': 0.1}},
 'XGBClassifier': {'model': RandomizedSearchCV(cv=5,
                     estimator=XGBClassifier(base_score=None, booster=None,
                                             callbacks=None,
                                             colsample_bylevel=None,
                                             colsample_bynode=None,
                                             colsample_bytree=None,
                                             early_stopping_rounds=None,
                  

Get the best model

In [10]:
def get_best_model(return_file=True):
    """Function to get the best model"""
    # Load tuned model
    list_of_tuned_model = utils.pickle_load(CONFIG_DATA['list_of_tuned_model_path'])

    # Get the best model
    best_model_name = None
    best_model = None
    best_performance = -99999
    best_model_param = None

    for model_name, model in list_of_tuned_model.items():
        if model['valid_auc'] > best_performance:
            best_model_name = model_name
            best_model = model['model']
            best_performance = model['valid_auc']
            best_model_param = model['best_params']

    # Dump the best model
    utils.pickle_dump(best_model, CONFIG_DATA['best_model_path'])

    # Print
    print('=============================================')
    print('Best model        :', best_model_name)
    print('Metric score      :', best_performance)
    print('Best model params :', best_model_param)
    print('=============================================')

    if return_file:
        return best_model


In [11]:
best_model = get_best_model()

Best model        : XGBClassifier
Metric score      : 0.8605813878486432
Best model params : {'n_estimators': 50}


In [12]:
best_model

Tune the threshold

In [13]:
from sklearn.metrics import f1_score

In [14]:
THRESHOLD = np.linspace(0, 1, 100)

In [15]:
def get_best_threshold(return_file=True):
    """Function to tune & get the best decision threshold"""
    # Load data & model
    X_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][0])
    y_valid = utils.pickle_load(CONFIG_DATA['valid_clean_path'][1])
    best_model = utils.pickle_load(CONFIG_DATA['best_model_path'])

    # Get the proba pred
    y_pred_proba = best_model.predict_proba(X_valid)[:, 1]

    # Initialize
    metric_threshold = pd.Series([])
    
    # Optimize
    for threshold_value in THRESHOLD:
        # Get predictions
        y_pred = (y_pred_proba >= threshold_value).astype(int)

        # Get the F1 score
        metric_score = f1_score(y_valid, y_pred, average='macro')

        # Add to the storage
        metric_threshold[metric_score] = threshold_value

    # Find the threshold @max metric score
    metric_score_max_index = metric_threshold.index.max()
    best_threshold = metric_threshold[metric_score_max_index]
    print('=============================================')
    print('Best threshold :', best_threshold)
    print('Metric score   :', metric_score_max_index)
    print('=============================================')
    
    # Dump file
    utils.pickle_dump(best_threshold, CONFIG_DATA['best_threshold_path'])

    if return_file:
        return best_threshold


In [16]:
get_best_threshold()

  metric_threshold = pd.Series([])


Best threshold : 0.8585858585858587
Metric score   : 0.695584290568394


0.8585858585858587