# SMOTE Analysis

In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imb_Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [None]:
def smote_params(X_train, y_train, X_test, y_test, RatioList, ScoringMethod, TimeIt=True):
    """
    Grid-search the SMOTE oversampling ratio for a random-forest pipeline.

    Builds a StandardScaler -> SMOTE -> RandomForestClassifier pipeline,
    runs a 5-fold stratified GridSearchCV over 'SMOTE__sampling_strategy'
    on the training data, then scores the fitted search on the test data.

    Parameters
    ----------
    X_train, X_test - dataframes
    y_train, y_test - dataSeries
    RatioList - list, containing minority oversampling ratio values to test
    ScoringMethod - string, metric to be optimized
    TimeIt - boolean, flag to print the elapsed processing time

    Returns
    -------
    grid - DataFrame with columns 'score' (mean cross-validated score) and
        'ResampleRatio' (ratio tested), one row per entry of RatioList
    bestparm - sampling ratio selected by the search
    CVtrain_score - best mean cross-validated score on the training data
    test_score - score of the fitted search on the test data
    """
    start_time = time.time()

    # SEED and CPU are module-level settings; they are not defined in the
    # visible part of this file and are expected to come from a setup cell.
    # Seeding the classifier and the CV splitter as well as SMOTE keeps the
    # reported scores repeatable between runs.
    classifier = RandomForestClassifier(random_state=SEED)
    steps = [('scaler', StandardScaler()),
             ('SMOTE', SMOTE(random_state=SEED, n_jobs=CPU)),
             ('Classifier', classifier)]
    pipeline = imb_Pipeline(steps)
    parameters = {'SMOTE__sampling_strategy': RatioList}
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

    searcher = GridSearchCV(pipeline, param_grid=parameters, cv=kf, scoring=ScoringMethod)
    searcher.fit(X_train, y_train)

    # With a single searched parameter the candidates are evaluated in
    # RatioList order, so the scores line up with the ratios row by row.
    grid = pd.DataFrame({'score': searcher.cv_results_['mean_test_score'],
                         'ResampleRatio': RatioList})

    bestparm = searcher.best_params_['SMOTE__sampling_strategy']
    CVtrain_score = searcher.best_score_
    test_score = searcher.score(X_test, y_test)

    if TimeIt:
        elapsed = time.time() - start_time
        # Report the classifier actually used instead of an undefined `clf`.
        print(f"{elapsed:.0f} seconds execution time for {type(classifier).__name__}")

    return grid, bestparm, CVtrain_score, test_score

In [None]:
# Module-level slots for the four values returned by smote_params().
# Unpacking the bare function object (no call) raises TypeError, and the
# training data needed for a real call are only loaded further down, so the
# names are declared empty here. Rebind them with
#   grid, bestparm, CVtrain_score, test_score = smote_params(...)
# once the data are available and before calling the reporting helpers.
grid = bestparm = CVtrain_score = test_score = None

def print_smote_parms():
    """
    Print the SMOTE search summary.

    Reads the module-level names bestparm, CVtrain_score and test_score and
    prints one line for each; the last line is followed by a blank line.
    """
    best_line = f"Best CV params {bestparm}"
    print(best_line)

    cv_line = f"Best CV value using training dataset: {CVtrain_score:.3f}"
    print(cv_line)

    test_line = f"Best grid search fit using testing dataset: {test_score:.3f}\n"
    print(test_line)

def plot_smote_parms(ax, df_gridCV=None):
    """
    Plot the search scores against the SMOTE resample ratio on one axes.

    Parameters
    ----------
    ax - axes object to draw on; its title and y label are set here
    df_gridCV - optional results table with a 'ResampleRatio' column, used
        as the x axis; when omitted the module-level `grid` is plotted
    """
    if df_gridCV is None:
        # Fall back to the module-level results table.
        df_gridCV = grid
    df_gridCV.plot(ax=ax, x='ResampleRatio')
    ax.set_title("SMOTE Recall Scoring")
    ax.set_ylabel('Recall Score')

def plot_smote_parms_dashboard(clf_lst, classifiers):
    """
    Draw a row of SMOTE score panels, one per entry of clf_lst.

    Each panel is drawn by plot_smote_parms and then marked with a black dot
    at (bestparm, CVtrain_score), both read from module level. Every panel
    therefore shows the same results; clf_lst only sets the panel count.

    Parameters
    ----------
    clf_lst - list, one panel is created per entry
    classifiers - dictionary, accepted for call compatibility; not read here
    """
    if not clf_lst:
        # plt.subplots cannot create a grid with zero columns.
        return

    # squeeze=False keeps axs two-dimensional, so a one-entry clf_lst can be
    # handled the same way as a longer list.
    fig, axs = plt.subplots(1, len(clf_lst), figsize=(12, 4), sharey=True,
                            squeeze=False)
    for ax in axs[0]:
        plot_smote_parms(ax)
        ax.plot(bestparm, CVtrain_score, 'ko', label="Optimum")
        ax.legend(loc='lower right')
    # Optional export of the figure:
    # fig.savefig("../images/smote_parms.png")



In [None]:
# Load the train/test/holdout splits. `h` and `holdoutseed` are not defined
# in the visible part of this file and are expected to come from a setup cell.
X_train, X_test, y_train, y_test, \
c_train, c_test, X_holdout, y_holdout, c_holdout, \
features = h.load_data(holdoutseed, engineered_features=False)
print('Positive Class frequency: {:.5f}'.format(y_train.mean()))

RatioList = [0.002, 0.005, 0.01, 0.02, 0.03, 0.04, 0.05, 0.10, 0.15, 0.20, 0.25]
ScoringMethod = 'recall'
clf_lst = ["LR", "RF", "XGB"]

# smote_params takes the data, the ratio list and the scoring metric and
# returns a 4-tuple; it has no per-classifier argument, so it is called once
# and its results are bound to the module-level names the helpers read.
grid, bestparm, CVtrain_score, test_score = smote_params(X_train, y_train,
                                                         X_test, y_test,
                                                         RatioList, ScoringMethod,
                                                         TimeIt=True)

# NOTE(review): `classifiers` and `update_pipeline` are not defined in the
# visible part of this file. smote_params no longer updates `classifiers`,
# so these two calls act on it as it stands. Confirm whether the search
# results should be written into it before it is saved.
update_pipeline(clf_lst, classifiers)
h.save_classifier_dict(classifiers, "04")

# print_smote_parms takes no arguments.
print_smote_parms()
plot_smote_parms_dashboard(clf_lst, classifiers)
