In [1]:
# !pip install factor_analyzer
import six
import sys
sys.modules['sklearn.externals.six'] = six
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, homogeneity_score
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from factor_analyzer import FactorAnalyzer

from sklearn.metrics import roc_curve,roc_auc_score, auc, f1_score, accuracy_score, recall_score
import pandas as pd
import numpy as np
from sklearn import tree
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import seaborn
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from statistics import mean
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from time import process_time
import itertools 

In [None]:
def train_valid_test_split(df, target = 'fraud_bool'):
    """Split ``df`` into stratified train / validation / test parts.

    The frame is first split 80/20 into a working set and a test set; the
    working set is then split 80/20 again into train and validation. Both
    splits are stratified on the target column and use ``random_state=0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    target : str
        Name of the label column that is removed from the features.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    features = df.drop(target, axis=1)
    labels = df[target]
    split_options = {'train_size': 0.8, 'random_state': 0}

    # First cut: hold out the test set.
    X_work, X_test, y_work, y_test = train_test_split(
        features, labels, stratify=labels, **split_options)

    # Second cut: carve the validation set out of the remaining rows.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_work, y_work, stratify=y_work, **split_options)

    return X_train, y_train, X_valid, y_valid, X_test, y_test

def train_valid_test_split2(df, target = 'Segmentation'):
    """Stratified train / validation / test split with a different default target.

    This is the same procedure as ``train_valid_test_split``; only the default
    label column differs (``'Segmentation'``). It delegates to that function
    so the split logic lives in one place.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    target : str
        Name of the label column that is removed from the features.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
    """
    return train_valid_test_split(df, target=target)

# Feature engineering

In [None]:
# knn impute numeric columns and one hot encode category columns
def ohe(X_train, X_valid, X_test):
    """KNN-impute numeric columns and one-hot encode the remaining columns.

    The imputer is fitted on the training split only and then applied to each
    split's *own* rows. The one-hot encoded validation and test frames are
    re-indexed to the training columns: categories unseen in training are
    dropped and categories missing from a split become all-zero columns.

    Parameters
    ----------
    X_train, X_valid, X_test : pandas.DataFrame
        Feature frames sharing the same columns. They are not modified;
        the function works on copies.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_valid, X_test)`` encoded, each keeping its original row
        index and all having the training frame's columns.
    """
    # Copy so that column assignment below does not touch the caller's frames.
    frames = [X_train.copy(), X_valid.copy(), X_test.copy()]
    numcols = frames[0].select_dtypes(include=['float', 'int']).columns

    # Only build/fit the imputer when there is something to impute.
    if len(numcols) > 0:
        imputer = KNNImputer(n_neighbors=3, weights="uniform")
        imputer.fit(frames[0][numcols])
        for frame in frames:
            # Assign the raw array so values are written positionally; wrapping
            # them in a new DataFrame would re-align on a fresh 0..n-1 index
            # and scramble rows of a shuffled split.
            frame[list(numcols)] = imputer.transform(frame[numcols])

    # One-hot encode each split, then force the training column layout.
    train_enc, valid_enc, test_enc = (pd.get_dummies(frame) for frame in frames)
    valid_enc = valid_enc.reindex(columns=train_enc.columns, fill_value=0)
    test_enc = test_enc.reindex(columns=train_enc.columns, fill_value=0)

    return train_enc.fillna(0), valid_enc.fillna(0), test_enc.fillna(0)

# X_train, X_valid, X_test = ohe(X_train, X_valid, X_test)

In [None]:
def std_scale(X_train, X_valid, X_test):
    """Standardise the three splits with a scaler fitted on the training split.

    Each transformed split is wrapped in a new DataFrame that reuses the
    training column labels. No index is passed when rebuilding the frames, so
    the original row index of the inputs is not carried over.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train, X_valid, X_test)`` after scaling.
    """
    scaler = StandardScaler().fit(X_train)
    column_labels = X_train.columns

    def _scaled(frame):
        # Apply the training-set statistics and restore the column labels.
        return pd.DataFrame(scaler.transform(frame), columns=column_labels)

    return _scaled(X_train), _scaled(X_valid), _scaled(X_test)

In [None]:
def drop_catcols(X_train, X_valid, X_test):
    """Drop object-dtype columns from all three splits, in place.

    The columns to remove are determined from ``X_train`` only and the same
    column labels are then dropped from every split. The input frames
    themselves are modified and returned.

    Returns
    -------
    tuple of pandas.DataFrame
        The same ``(X_train, X_valid, X_test)`` objects that were passed in.
    """
    object_columns = X_train.select_dtypes(include=['object']).columns
    for split in (X_train, X_valid, X_test):
        split.drop(columns=object_columns, inplace=True)

    return X_train, X_valid, X_test

In [None]:
def oversampling(X_train, y_train):
    """Randomly over-sample the training data with ``RandomOverSampler``.

    The sampler is created with ``sampling_strategy='auto'`` and
    ``random_state=0`` and applied to the given features and labels.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by ``fit_resample``.
    """
    from imblearn.over_sampling import RandomOverSampler

    resampler = RandomOverSampler(sampling_strategy='auto', random_state=0)
    X_resampled, y_resampled = resampler.fit_resample(X_train, y_train)

    return X_resampled, y_resampled

# Modelling

In [None]:
def model(model, X_train, y_train, X_valid, y_valid):
    """Fit a binary classifier and summarise train / validation performance.

    ROC AUC is computed from continuous scores (``predict_proba`` for the
    class with the greater label, else ``decision_function``). Hard
    ``predict`` labels are used only as a last resort, because thresholded
    labels reduce the ROC curve to a single operating point.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style classifier (note: the parameter shadows
        this function's own name inside the body).
    X_train, y_train : training features and labels.
    X_valid, y_valid : validation features and labels.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``acc``, ``cv_score``, ``train_roc``,
        ``test_roc``, ``recall_score``, ``f1_score`` and ``run_time``.
        ``run_time`` is the CPU time of fitting plus predicting on both
        splits; metric computation is not included.
    """
    from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score

    def _ranking_scores(estimator, X, hard_pred):
        # Prefer a continuous score so roc_auc_score can sweep thresholds.
        if hasattr(estimator, 'predict_proba'):
            return estimator.predict_proba(X)[:, 1]
        if hasattr(estimator, 'decision_function'):
            return estimator.decision_function(X)
        return hard_pred

    t_start = process_time()
    model = model.fit(X_train, y_train)
    # predict
    train_pred = model.predict(X_train)
    valid_pred = model.predict(X_valid)
    t_stop = process_time()
    run_time = t_stop - t_start

    # calculate score (results get their own names so the imported metric
    # functions are not overwritten)
    train_roc = roc_auc_score(y_train, _ranking_scores(model, X_train, train_pred))
    cv_score = cross_val_score(model, X_train, y_train, cv=5)
    test_roc = roc_auc_score(y_valid, _ranking_scores(model, X_valid, valid_pred))
    f1 = f1_score(y_valid, valid_pred, average='weighted')
    acc = accuracy_score(y_valid, valid_pred)
    recall = recall_score(y_valid, valid_pred, average='weighted')

    perf = pd.DataFrame({'acc':[round(acc,3)],
                         'cv_score':[round(mean(cv_score),3)],
                         'train_roc':[round(train_roc,3)],
                         'test_roc':[round(test_roc,3)],
                         'recall_score':[round(recall,3)],
                         'f1_score':[round(f1,3)],
                         'run_time':[round(run_time,5)]
                        })
    return perf

In [None]:
def model2(model, X_train, y_train, X_valid, y_valid):
    """Fit a classifier and summarise performance using multi-class ROC AUC.

    ROC AUC is computed from ``predict_proba`` with ``multi_class='ovr'`` and
    ``average='weighted'``. F1 and recall are weighted averages on the
    validation split.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``acc``, ``cv_score``, ``train_roc``,
        ``test_roc``, ``recall_score``, ``f1_score`` and ``run_time``, all
        rounded to three decimals. ``run_time`` is the CPU time of fitting
        plus the label and probability predictions on both splits.
    """
    from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, recall_score

    started = process_time()
    model = model.fit(X_train, y_train)
    # Label and probability predictions are all part of the timed section.
    train_labels = pd.DataFrame(model.predict(X_train))
    valid_labels = pd.DataFrame(model.predict(X_valid))
    train_proba = model.predict_proba(X_train)
    valid_proba = model.predict_proba(X_valid)
    elapsed = process_time() - started

    roc_options = {'multi_class': 'ovr', 'average': 'weighted'}
    results = {}
    results['train_roc'] = roc_auc_score(y_train, train_proba, **roc_options)
    results['cv_score'] = mean(cross_val_score(model, X_train, y_train, cv=5))
    results['test_roc'] = roc_auc_score(y_valid, valid_proba, **roc_options)
    results['f1_score'] = f1_score(y_valid, valid_labels, average='weighted')
    results['acc'] = accuracy_score(y_valid, valid_labels)
    results['recall_score'] = recall_score(y_valid, valid_labels, average='weighted')
    results['run_time'] = elapsed

    # Emit the columns in the fixed reporting order.
    column_order = ['acc', 'cv_score', 'train_roc', 'test_roc',
                    'recall_score', 'f1_score', 'run_time']
    perf = pd.DataFrame({name: [round(results[name], 3)] for name in column_order})
    return perf

# Graph functions

In [None]:
def learning_curve_graph(perf_out):
    """Plot recall, train/test ROC and F1 against the training-data ratio.

    ``perf_out`` must provide the columns ``train_ratio``, ``recall_score``,
    ``train_roc``, ``test_roc`` and ``f1_score``. ``run_time`` is not plotted.
    """
    # Each curve's legend label is the same as its column name.
    for metric in ('recall_score', 'train_roc', 'test_roc', 'f1_score'):
        plt.plot(perf_out['train_ratio'], perf_out[metric], label=metric)

    plt.xlabel("train data size")
    plt.ylabel("score")
    plt.legend()
    plt.show()

In [None]:
def importance_graph(clf, X_train, top_n=10):
    """Bar-plot the most important features of a fitted estimator.

    Parameters
    ----------
    clf : estimator
        Fitted estimator exposing ``feature_importances_``.
    X_train : pandas.DataFrame
        Frame whose columns name the features, in the order the estimator
        was trained on.
    top_n : int, default 10
        Number of highest-importance features to show.

    Notes
    -----
    Calls ``sns.set`` with a (25, 15) figure size, which changes seaborn's
    global settings for later plots as well.
    """
    import seaborn as sns

    importances = pd.DataFrame({'features': X_train.columns,
                                'importance': np.round(clf.feature_importances_, 3)})
    # Sort once, descending, so the first rows are the strongest features.
    importances = importances.sort_values('importance', ascending=False)

    sns.set(rc={'figure.figsize':(25,15)})
    sns.barplot(x='importance', y='features', data=importances.head(top_n))

In [None]:
def tree_graph(clf, X_train, y_train):
    """Draw an already-fitted decision tree with feature and class labels.

    Parameters
    ----------
    clf : fitted tree estimator
        Passed to ``tree.plot_tree`` as is; this function does not fit it.
    X_train : pandas.DataFrame
        Its column labels are used as feature names.
    y_train : pandas.Series
        Only used to derive class names when ``clf`` has no ``classes_``
        attribute.
    """
    # plot_tree expects class names in ascending class order. clf.classes_ is
    # already in the estimator's own order; unique() alone returns order of
    # first appearance, which can mislabel the nodes.
    classes = getattr(clf, 'classes_', None)
    if classes is None:
        classes = sorted(y_train.unique())

    plt.figure(figsize=(25,20))
    # Plain lists are passed because some scikit-learn versions reject a
    # pandas Index / numpy array for these parameters.
    tree.plot_tree(clf,
                   feature_names=list(X_train.columns),
                   class_names=[str(c) for c in classes],
                   filled=True)

In [None]:
def plot_kmeans(X_train):
    """Plot silhouette score and SSE (inertia) of KMeans for k = 1..9.

    Each k is fitted once with ``random_state=0``. Inertia is recorded for
    every k; the silhouette score is recorded for k >= 2 only, since it is
    not computed for a single cluster. The two curves share the x axis and
    use separate y axes.
    """
    k_range = range(1, 10)     # k values for the SSE curve
    k_range2 = range(2, 10)    # k values for the silhouette curve
    silhouette_scores = []
    inertia_scores = []

    # One fit per k feeds both curves; the settings are deterministic, so
    # fitting separately for each metric would only repeat the same work.
    for k in k_range:
        model = KMeans(n_clusters=k, random_state=0)
        cluster_labels = model.fit_predict(X_train)
        inertia_scores.append(model.inertia_)
        if k in k_range2:
            silhouette_scores.append(silhouette_score(X_train, cluster_labels))

    # Plot the scores for each k value
    fig, ax1 = plt.subplots(figsize=(8, 6))
    color = 'blue'
    ax1.set_xlabel('Number of clusters')
    ax1.set_ylabel('Silhouette Score', color=color)
    ax1.plot(k_range2, silhouette_scores, label='Silhouette Score', color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    # Second y axis for SSE, which is on a different scale.
    ax2 = ax1.twinx()
    color = 'red'
    ax2.set_ylabel('SSE', color=color)
    ax2.plot(k_range, inertia_scores, label='SSE', color=color)
    ax2.tick_params(axis='y', labelcolor=color)

    ax1.legend(loc='lower left')
    ax2.legend(loc='upper right')

    fig.tight_layout()
    plt.show()

In [None]:
def plot_cluster_label(cluster_labels, label):
    """Print and plot how the true labels are distributed across clusters.

    Builds a cluster-by-label count table (missing combinations filled with
    0), prints it, and draws it as a stacked bar chart with one bar per
    cluster.
    """
    assignments = pd.DataFrame({'label':label, 'cluster':cluster_labels})
    per_cluster_counts = assignments.groupby('cluster')['label'].value_counts()
    op = per_cluster_counts.unstack(fill_value=0)
    print(op)

    bar_colors = ['blue', 'red', 'orange','green', 'black', 'yellow',  'pink']
    ax = op.plot(kind='bar', stacked=True, color=bar_colors)
    ax.set_xlabel('cluster result')
    ax.set_ylabel('label')
    plt.show()

In [2]:
def plot_em(X_train):
    """Plot the silhouette score of Gaussian-mixture clusterings for k = 2..7.

    Each mixture is fitted once via ``fit_predict`` with ``random_state=0``,
    so repeated runs draw the same curve.
    """
    silhouette_scores = []
    k_range2 = range(2, 8)
    for k in k_range2:
        # fit_predict fits the model itself; a separate .fit() beforehand
        # would only train the mixture a second time.
        model = GaussianMixture(n_components=k, random_state=0)
        cluster_labels = model.fit_predict(X_train)
        silhouette_scores.append(silhouette_score(X_train, cluster_labels))

    fig, ax1 = plt.subplots(figsize=(8, 6))
    color = 'blue'
    ax1.set_xlabel('Number of clusters')
    ax1.set_ylabel('Silhouette Score', color=color)
    ax1.plot(k_range2, silhouette_scores, label='Silhouette Score', color=color)
    ax1.tick_params(axis='y', labelcolor=color)

    ax1.legend(loc='lower left')

    fig.tight_layout()
    plt.show()