In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to the local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# =============================================================================
#  Import Section: RANDOM FOREST
# =============================================================================
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import interp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

# Load Data Imports
import load_data as ld
from data_columns import *

In [None]:
# ==================================================================
# Parameters for Random Forest exploration
# ==================================================================
# Number of candidate n_estimators values to evaluate
n_n_estimators = 10
# Bounds for the candidate n_estimators values (fed to np.arange when the
# grid is built, so the upper bound itself is excluded)
n_estim_lowbound = 2
n_estim_upbound = 20

assert n_estim_upbound > n_estim_lowbound, "### ERROR: n_estim_upbound < n_estim_lowbound"
# The grid step is round(range / count); the count must stay below the range
# width so that the step cannot round down to 0.
assert n_n_estimators < (n_estim_upbound - n_estim_lowbound), "### ERROR: n_n_estimators > (n_estim_upbound - n_estim_lowbound)"

# Number of candidate tree depths to evaluate
n_max_depth = 10
# Bounds for the candidate max_depth values (upper bound excluded, as above)
nmaxdepth_lowbound = 5
nmaxdepth_upbound = 40
assert nmaxdepth_upbound > nmaxdepth_lowbound, "### ERROR: nmaxdepth_upbound < nmaxdepth_lowbound"
assert n_max_depth < (nmaxdepth_upbound - nmaxdepth_lowbound), "### ERROR: n_max_depth > (nmaxdepth_upbound - nmaxdepth_lowbound)"

# Value passed as min_samples_split to every RandomForestClassifier
n_min_samples_split = 50

# Fraction of the data held out as the test set
split_size = 0.25
# Choose if to stratify the split or not.
# NOTE(review): this value is overwritten with the flag returned by
# split_test_train(), which always attempts a stratified split first.
stratified = True

# Number of cross-validation folds
n_Kfold_splits = 5

In [None]:
# Candidate forest sizes: evenly stepped values inside the configured bounds
estim_step = round((n_estim_upbound - n_estim_lowbound) / n_n_estimators)
n_estimators = np.arange(n_estim_lowbound, n_estim_upbound, step=estim_step)
print("Possible number of trees: ", n_estimators)

# Candidate maximum tree depths, built the same way
depth_step = round((nmaxdepth_upbound - nmaxdepth_lowbound) / n_max_depth)
max_depth = np.arange(nmaxdepth_lowbound, nmaxdepth_upbound, step=depth_step)
print("Possible max depths of trees: ", max_depth)

# Exploration grid: every (number of trees, max depth) pair, trees varying slowest
rf_grid = []
for n_trees in n_estimators.tolist():
    for tree_depth in max_depth.tolist():
        rf_grid.append((n_trees, tree_depth))

In [None]:
# Grid size: each candidate tree count is paired with each candidate depth
n_tot_cases = len(n_estimators) * len(max_depth)
print("A total of ", n_tot_cases, " cases will be evaluated.\n")

In [None]:
# Output directory for the winning model
out_path_winner = "./rf_output/"

# Output directory for the log table and plots
out_path_log = "./rf_output/log/"

# Running scores, all starting from zero
mean_auc_val = 0.0
max_auc_test = 0.0

# Log table: one row per evaluated grid point, indexed by logger_index
out_cols = [
    "mode_n_trees",
    "mode_tree_depth",
    "mode_cv_mean_auc",
    "mode_test_auc",
    "mode_image_path",
]
pd_logger = pd.DataFrame(columns=out_cols)
logger_index = 0

# Record of the best-performing forest on the test set
rf_winner = {
    "logger_index": None,
    "mode_n_trees": 0,
    "mode_tree_depth": 0,
    "mode_auc_val": 0.0,
    "mode_auc_test": 0.0,
    "mode_rf_object": None,
}

In [None]:
# ==================================================================
# Load the data
# ==================================================================

# Read the training table from "./train_data" through the project's
# load_data helper. The result is used below as a pandas DataFrame
# (.isnull(), .sample() and column indexing are called on it).
# NOTE(review): the effect of verbose=True is defined inside load_data and is
# not visible from this notebook.
df_train = ld.load_rf_trainer_data("./train_data", verbose=True)

In [None]:
# Per-column flag: True where the column holds at least one missing value.
df_train.isnull().any()

In [None]:
def split_test_train(in_df, xcols, ycol, filtercols, in_test_size, random_state=None):
    """Split a DataFrame into train/test feature and target arrays.

    A stratified split on the target is attempted first. If scikit-learn
    rejects it with a ValueError, the function falls back to a plain random
    split.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Table holding both the feature columns and the target column.
    xcols : iterable of str
        Candidate feature column names.
    ycol : str
        Name of the target column.
    filtercols : container of str
        Column names to drop from ``xcols`` before building the features.
    in_test_size : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split. The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Note the order: both training arrays come first, then both test arrays.
    stratified : bool
        True when the stratified split succeeded, False when the fallback
        was used.
    """
    feat_cols = [col for col in xcols if col not in filtercols]

    X = in_df[feat_cols].values
    Y = in_df[ycol].values

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=in_test_size, stratify=Y, random_state=random_state)
        stratified = True
    except ValueError:
        # Raised e.g. when a class has too few members to appear in both
        # partitions; retry without stratification.
        print("## NOTE: COULD NOT STRATIFY TARGET COL: too few samples ")
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=in_test_size, random_state=random_state)
        stratified = False
    return X_train, y_train, X_test, y_test, stratified

In [None]:
# ===============================================================
# Load and split the Data
# ===============================================================

# Shuffle the rows of the DataFrame before anything else
df_train = df_train.sample(frac=1)

# Columns that must never be used as features
filter_cols = INNERJOIN_COLS + TARGET_COLS

# Distinct target classes (sorted by np.unique) and how many there are
Y_tot = df_train[TARGET_SEGMODE].values
n_classes_unique = np.unique(Y_tot)
n_classes = n_classes_unique.shape[0]
print("SEGMENTS MODE unique classes: ", n_classes_unique)
print("Number of SEGMENTS MODE unique classes: ", n_classes)

# Train/test partition; `stratified` records whether stratification succeeded
mode_split = split_test_train(df_train, RES_MERGED_COLS, TARGET_SEGMODE,
                              filter_cols, split_size)
Xmode_train, Ymode_train, Xmode_test, Ymode_test, stratified = mode_split

In [None]:
# Display the training-split target array as a quick sanity check.
# NOTE(review): this shows the whole array; a slice may be preferable for large data.
Ymode_train

In [None]:
# ===============================================================
# Build-Train-Test
# ===============================================================
def _per_class_roc(y_true, y_score, classes):
    """Compute one-vs-rest ROC curves and AUCs for every class.

    Parameters
    ----------
    y_true : numpy.ndarray
        True class label of each sample.
    y_score : numpy.ndarray
        Score matrix from ``predict_proba``; column ``k`` is paired with
        ``classes[k]``.
        NOTE(review): this assumes the classifier saw every class during
        fitting, so its columns line up with the sorted ``classes`` array.
    classes : sequence
        Class labels, one per score column.

    Returns
    -------
    fpr, tpr, roc_auc : dict
        Per-class false-positive rates, true-positive rates and AUC values,
        keyed by class label.
    mean_auc : float
        Mean of the per-class AUCs, with undefined (NaN) AUCs counted as 0.
    """
    fpr, tpr, roc_auc = {}, {}, {}
    for col, cls in enumerate(classes):
        # The builtin float replaces np.float, which NumPy 1.24 removed.
        is_cls = np.asarray(y_true == cls, dtype=float)
        fpr[cls], tpr[cls], _ = roc_curve(is_cls, y_score[:, col])
        roc_auc[cls] = auc(fpr[cls], tpr[cls])
    # A class absent from y_true gives a NaN AUC; zero it so that a single
    # class cannot turn the whole mean into NaN.
    mean_auc = np.mean(np.nan_to_num(np.asarray(list(roc_auc.values()))))
    return fpr, tpr, roc_auc, mean_auc


t_start = time.time()

# Perform oversampling if needed
# X_smot, y_smot = preprocess.perform_smote(X_train, y_train)

# Make sure the output directory exists before any model is trained, so a
# missing folder cannot abort the run after the first (expensive) fit.
os.makedirs(out_path_winner, exist_ok=True)
pickle_out = os.path.join(out_path_winner, "rf_winner.pkl")
out_file = os.path.join(out_path_winner, "rf_best_auc.png")
log_file = os.path.join(out_path_winner, "log.csv")

# The splitter does not depend on the grid point, so build it once.
if stratified:
    cv = StratifiedKFold(n_splits=n_Kfold_splits)
else:
    cv = KFold(n_splits=n_Kfold_splits)

for (estim, depth) in rf_grid:
    classifier = OneVsRestClassifier(
        RandomForestClassifier(n_estimators=estim, max_depth=depth,
                               min_samples_split=n_min_samples_split))

    # --- Cross-validation on the training split ---
    mean_auc_cv = []
    for cv_tracker, (train, test) in enumerate(cv.split(Xmode_train, Ymode_train)):

        print("Starting CV fold ", cv_tracker)

        classifier.fit(Xmode_train[train], Ymode_train[train])
        y_score = classifier.predict_proba(Xmode_train[test])

        mean_auc_fold = _per_class_roc(Ymode_train[test], y_score, n_classes_unique)[-1]
        mean_auc_cv.append(mean_auc_fold)
        print("MODE mean ROC AUC on val set ", cv_tracker, " :", mean_auc_fold)

    mean_auc_TRAIN = np.mean(np.asarray(mean_auc_cv))

    print("### MODE mean ROC AUC in CROSS validated set: ", mean_auc_TRAIN, "\n")
    print("### MODE evaluating test set ... ")

    # --- Test-set evaluation ---
    # Refit on the complete training split: after the CV loop the classifier
    # only holds the fit from the last fold's training subset.
    classifier.fit(Xmode_train, Ymode_train)
    y_test_score = classifier.predict_proba(Xmode_test)

    fpr_test, tpr_test, roc_auc_test, mean_auc_fold_test = _per_class_roc(
        Ymode_test, y_test_score, n_classes_unique)

    # NOTE(review): the winner is chosen by test-set AUC, so the reported test
    # score is optimistically biased; selecting on mean_auc_TRAIN would avoid that.
    if mean_auc_fold_test > max_auc_test:

        max_auc_test = mean_auc_fold_test

        print("### MODE MAXIMUM AUC TEST FOUND: ", mean_auc_fold_test)
        print("### MODE Number of trees: ", estim)
        print("### Max depth of trees: ", depth)

        rf_winner["logger_index"] = logger_index
        rf_winner["mode_n_trees"] = estim
        rf_winner["mode_tree_depth"] = depth
        rf_winner["mode_auc_test"] = mean_auc_fold_test
        # Store the cross-validated mean, not the AUC of the last fold only.
        rf_winner["mode_auc_val"] = mean_auc_TRAIN
        rf_winner["mode_rf_object"] = classifier

        # Save winner model
        with open(pickle_out, 'wb') as file:
            pickle.dump(rf_winner, file, protocol=pickle.HIGHEST_PROTOCOL)

        # Draw on a fresh figure so curves from earlier winners do not pile up
        # in the saved image.
        fig, ax = plt.subplots()
        for i_class in n_classes_unique:
            ax.plot(fpr_test[i_class], tpr_test[i_class],
                    label='ROC curve of class {0} (area = {1:0.2f})'
                          ''.format(i_class, roc_auc_test[i_class]))
            print("# Best test AUC for class ", i_class, ": ", roc_auc_test[i_class])
        ax.plot([0, 1], [0, 1], 'k--')
        ax.set_xlim([-0.05, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate', fontsize=14)
        ax.set_ylabel('True Positive Rate', fontsize=14)
        ax.set_title('ROC for RF multi-class', fontsize=16)
        ax.legend(loc="lower right")
        fig.savefig(out_file, format="png")
        # Release the figure; many winners would otherwise keep figures open.
        plt.close(fig)

    pd_logger.loc[logger_index] = {"mode_n_trees": estim,
                                   "mode_tree_depth": depth,
                                   "mode_cv_mean_auc": mean_auc_TRAIN,
                                   "mode_test_auc": mean_auc_fold_test,
                                   "mode_image_path": out_file}
    # Save log table after every grid point so partial runs are not lost
    pd_logger.to_csv(log_file)

    print("######################## Model iteration ", logger_index,
          " finished ######################### \n\n")
    logger_index += 1

print("FINISHED TRAINING")
# Elapsed seconds are divided by 60, so the figure is in minutes.
print("TIME TO COMPLETE ", n_tot_cases, " CV CASES: ", (time.time() - t_start)/60, " min")
print("Best performing RF combination:")
print("logger_index: ", rf_winner["logger_index"])
print("mode_n_trees", rf_winner["mode_n_trees"])
print("mode_tree_depth", rf_winner["mode_tree_depth"])
print("mode_auc_test", rf_winner["mode_auc_test"])
print("mode_auc_val", rf_winner["mode_auc_val"])