In [None]:
import pandas as pd
import numpy as np
import copy


import src.utils as utils

# Load Config File

In [None]:
# Load the project configuration through the project's `utils.config_load()` helper.
CONFIG_DATA = utils.config_load()
# Echo the loaded configuration as the cell output for a quick visual check.
CONFIG_DATA

# Create Model

Model of Choice
- KNN
- Logistic Regression
- Random Forest
- XGBoost

Define params

In [None]:
def create_model_param():
    """Build the hyperparameter search space for every candidate model.

    Returns
    -------
    dict
        Maps an estimator class name (``'KNeighborsClassifier'``,
        ``'RandomForestClassifier'``, ``'LogisticRegression'``,
        ``'XGBClassifier'``) to a dict of
        ``{parameter name: list of candidate values}``.
        A fresh dict is built on every call.
    """
    return {
        'KNeighborsClassifier': {
            'n_neighbors': [50, 100, 200],
        },
        'RandomForestClassifier': {
            'n_estimators': list(range(50, 150, 30)),  # 50, 80, 110, 140
            'min_samples_split': [2, 4, 6, 8],
            'criterion': ['gini', 'entropy', 'log_loss'],
        },
        'LogisticRegression': {
            # 'penalty' is not searched; only C and max_iter are varied.
            'C': [0.01, 0.1],
            'max_iter': [100, 300, 500],
        },
        'XGBClassifier': {
            'n_estimators': [5, 10, 25, 50],
        },
    }


Define Models

In [None]:
! pip install xgboost

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
def create_model_object(random_state=123):
    """Instantiate the untuned candidate classifiers.

    Parameters
    ----------
    random_state : int, default=123
        Seed passed to every estimator with stochastic behaviour (random
        forest, SAG logistic regression, XGBoost) so that repeated runs
        produce the same fitted models. The default matches the seed used
        for the randomized search in this notebook.

    Returns
    -------
    list of dict
        One entry per model with keys ``'model_name'`` (the estimator's class
        name) and ``'model_object'`` (the unfitted estimator instance).
    """
    print("Creating model objects")

    # KNeighborsClassifier takes no seed; the other three are seeded so the
    # whole experiment is reproducible.
    candidates = [
        KNeighborsClassifier(),
        RandomForestClassifier(random_state=random_state),
        LogisticRegression(solver='sag', random_state=random_state),
        XGBClassifier(random_state=random_state),
    ]

    # The class name doubles as the lookup key into the search-space dict.
    return [
        {'model_name': type(estimator).__name__, 'model_object': estimator}
        for estimator in candidates
    ]


Do the cross validation

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score


In [None]:
# Build the search spaces and the untuned estimators at top level for inspection.
# Note: train_model() builds its own copies internally and does not read these.
list_of_param = create_model_param()
list_of_model = create_model_object()

In [None]:
list_of_param  # a dict: model class name -> search space

In [None]:
def train_model(return_file=True):
    """Tune every candidate model with a randomized search and persist the results.

    The cleaned train/validation splits are read from the paths stored in
    ``CONFIG_DATA``. Each model from ``create_model_object()`` is tuned with
    ``RandomizedSearchCV`` (``n_iter=5``, 5-fold CV, ROC AUC scoring) over its
    search space from ``create_model_param()`` and then scored on both the
    training and the validation split. The search spaces, the base models and
    the tuned results are pickled to the paths stored in ``CONFIG_DATA``.

    Parameters
    ----------
    return_file : bool, default=True
        When true, return the three objects described below; otherwise the
        function returns ``None``.

    Returns
    -------
    tuple or None
        ``(list_of_param, list_of_model, list_of_tuned_model)``. The last item
        maps each model name to a dict with keys ``'model'`` (the fitted
        search object), ``'train_auc'``, ``'valid_auc'`` and ``'best_params'``.
    """
    # The training split fits the models; the validation split is used to
    # compare them and choose the best one. The test split (unseen data, the
    # "final exam") is deliberately not touched here.
    train_paths = CONFIG_DATA['train_clean_path']
    X_train = utils.pickle_load(train_paths[0])
    y_train = utils.pickle_load(train_paths[1])
    valid_paths = CONFIG_DATA['valid_clean_path']
    X_valid = utils.pickle_load(valid_paths[0])
    y_valid = utils.pickle_load(valid_paths[1])

    search_spaces = create_model_param()
    base_models = create_model_object()
    tuned_models = {}

    for candidate in base_models:
        name = candidate['model_name']
        # Hand the search a deep copy rather than the shared base estimator.
        estimator = copy.deepcopy(candidate['model_object'])
        space = search_spaces[name]

        print('Training model :', name)

        search = RandomizedSearchCV(
            estimator=estimator,
            param_distributions=space,
            n_iter=5,
            cv=5,
            random_state=123,
            n_jobs=1,
            verbose=10,
            scoring='roc_auc',
        )
        search.fit(X_train, y_train)

        # Column 1 of predict_proba is used as the score; per the original
        # note, class 0 is a legitimate transaction and class 1 is fraud.
        train_proba = search.predict_proba(X_train)[:, 1]
        valid_proba = search.predict_proba(X_valid)[:, 1]

        # Keep both scores so overfitting is visible: e.g. a training AUC of
        # 0.95 next to a validation AUC of 0.45 means the model overfits.
        tuned_models[name] = {
            'model': search,
            'train_auc': roc_auc_score(y_train, train_proba),
            'valid_auc': roc_auc_score(y_valid, valid_proba),
            'best_params': search.best_params_,
        }

        print("Done training")
        print("")

    # Persist all three artefacts to the configured paths.
    utils.pickle_dump(search_spaces, CONFIG_DATA['list_of_param_path'])
    utils.pickle_dump(base_models, CONFIG_DATA['list_of_model_path'])
    utils.pickle_dump(tuned_models, CONFIG_DATA['list_of_tuned_model_path'])

    if return_file:
        return search_spaces, base_models, tuned_models


In [None]:
# Run the full tuning loop: fits every candidate model and writes the pickles.
# This rebinds the three top-level names to the objects train_model() returns.
list_of_param, list_of_model, list_of_tuned_model = train_model()


## This is the key part

### Ways to address overfitting:
1. Regularization
2. TODO: list further techniques (e.g. reducing model complexity, collecting more training data)

In [None]:
# Inspect the tuned results: fitted search, train/validation AUC and best params per model.
list_of_tuned_model

Get the best model

In [None]:
def get_best_model(return_file=True):
    """Select the tuned model with the highest validation AUC and persist it.

    The tuned-model dict is read from ``CONFIG_DATA['list_of_tuned_model_path']``.
    The entry with the largest ``'valid_auc'`` wins (the first one wins on a
    tie), its ``'model'`` object is pickled to ``CONFIG_DATA['best_model_path']``
    and a short summary is printed.

    Parameters
    ----------
    return_file : bool, default=True
        When true, return the selected model; otherwise return ``None``.

    Returns
    -------
    object or None
        The ``'model'`` value of the winning entry. It is ``None`` when no
        entry scored above the sentinel, e.g. when the loaded dict is empty.
    """
    tuned = utils.pickle_load(CONFIG_DATA['list_of_tuned_model_path'])

    # Running champion; the sentinel sits below any score an AUC can take.
    best_model_name, best_model, best_model_param = None, None, None
    best_performance = -99999

    for candidate_name, record in tuned.items():
        # Strict comparison: a later model must beat, not merely match, the champion.
        if not record['valid_auc'] > best_performance:
            continue
        best_model_name = candidate_name
        best_model = record['model']
        best_performance = record['valid_auc']
        best_model_param = record['best_params']

    # Persist the winner before reporting it.
    utils.pickle_dump(best_model, CONFIG_DATA['best_model_path'])

    banner = '============================================='
    print(banner)
    print('Best model        :', best_model_name)
    print('Metric score      :', best_performance)
    print('Best model params :', best_model_param)
    print(banner)

    if return_file:
        return best_model

In [None]:
# The winner is chosen by its AUC on the validation data (see get_best_model).
best_model = get_best_model()


## Prediction on test data

In [None]:
# Load the cleaned test split from the two configured paths
# (index 0 -> X_test, index 1 -> y_test).
X_test = utils.pickle_load(CONFIG_DATA['test_clean_path'][0])
y_test = utils.pickle_load(CONFIG_DATA['test_clean_path'][1])

In [None]:
# Predict: keep column 1 of predict_proba as the score
# (presumably the positive/fraud class -- confirm against best_model.classes_).
y_test_proba = best_model.predict_proba(X_test)[:, 1]

# ROC AUC of the selected model on the test split.
score = roc_auc_score(y_test, y_test_proba)

In [None]:
# Display the test-set ROC AUC.
score