# k-means Subsegment Models + Resampling

In [None]:
import pandas as pd
import numpy as np
import pickle 
import matplotlib.pyplot as plt
import networkx as nx
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, average_precision_score
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from imblearn.pipeline import Pipeline as pipe_imb
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy
from sklearn import compose
from sklearn import neighbors
from sklearn import decomposition
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Thin wrapper that forwards both arguments to ``f1_score`` with
    ``average='macro'``.

    NOTE(review): this function is handed to ``XGBClassifier`` as
    ``eval_metric`` further down. ``f1_score`` expects class labels, so if
    XGBoost ever calls it with probability scores it would presumably fail -
    confirm against the installed XGBoost version.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

In [None]:
# Helper that builds one row of evaluation scores, plus the data frame that collects those rows
def print_results(headline, true_value, pred, probs):
    """Build one row of evaluation scores for a fitted classifier.

    Despite its name, this function prints nothing; it returns the scores.

    Parameters
    ----------
    headline : label stored in the first position of the row.
    true_value : ground-truth labels.
    pred : predicted labels.
    probs : scores passed to ``roc_auc_score`` and ``precision_recall_curve``.

    Returns
    -------
    list
        ``[headline, accuracy, TP, FP, TN, FN, precision, recall, roc_auc,
        pr_auc, f1]`` where ``f1`` is macro-averaged.

    The confusion-matrix lookups use positions 0 and 1 on both axes, i.e. the
    function is written for a two-class problem.
    """
    cm = confusion_matrix(true_value, pred)
    row = [
        headline,
        accuracy_score(true_value, pred),
        int(cm[1, 1]),  # TP
        int(cm[0, 1]),  # FP
        int(cm[0, 0]),  # TN
        int(cm[1, 0]),  # FN
        precision_score(true_value, pred),
        recall_score(true_value, pred),
        roc_auc_score(true_value, probs),
    ]
    # Area under the precision-recall curve: recall on x, precision on y.
    precision_pts, recall_pts, _ = precision_recall_curve(true_value, probs)
    row += [
        auc(recall_pts, precision_pts),
        f1_score(true_value, pred, average="macro"),
    ]
    return row

# Column order matches the row returned by print_results().
score_names = ['method','accuracy','TP','FP','TN','FN','precision','recall','roc_auc','pr_auc','f1']
# Empty results table; the width follows score_names so the two cannot drift apart.
dfAcc = pd.DataFrame(data=np.zeros(shape=(0, len(score_names))), columns=score_names)

In [None]:
# Display the results table (it stays empty until the evaluation cells append rows).
dfAcc

# Import data subsets

In [None]:
import pickle

# Load the pickled cluster subsets. The context manager closes the file even
# if unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open("D5_datasets_kmeans", 'rb') as fd:
    clu_datalist = pickle.load(fd)

In [None]:
# Positions 0-3 hold the feature sets and positions 4-7 the matching targets
# (presumably one pair per k-means cluster - confirm against the file's producer).
X4_0, X4_1, X4_2, X4_3 = (clu_datalist[pos] for pos in range(4))

y4_0, y4_1, y4_2, y4_3 = (clu_datalist[pos] for pos in range(4, 8))

## Prediction by using cluster-based data subsets

In [None]:
# Train/test split for each subset: 33% held out, same seed for every cluster.
# Each row of the matrix is [X_train, X_test, y_train, y_test] for one cluster.
cluster_train_test_matrix = [
    train_test_split(features, target, test_size=0.33, random_state=42)
    for features, target in (
        (X4_0, y4_0),
        (X4_1, y4_1),
        (X4_2, y4_2),
        (X4_3, y4_3),
    )
]

# Also expose every split under its own name; the cells below use these names.
(
    (X4_0_train, X4_0_test, y4_0_train, y4_0_test),
    (X4_1_train, X4_1_test, y4_1_train, y4_1_test),
    (X4_2_train, X4_2_test, y4_2_train, y4_2_test),
    (X4_3_train, X4_3_test, y4_3_train, y4_3_test),
) = cluster_train_test_matrix

### Tune cluster 0 

In [None]:
# Tune

# Fixed booster settings for the cluster-0 model.
# NOTE(review): eval_metric is presumably only consulted when fit() receives an
# eval_set, and none is passed in this notebook - confirm it is still needed.
xgb_settings = dict(
    objective='binary:logistic',
    eval_metric=f1_macro,
    n_estimators=1000,
    eta=0.005,
    max_depth=3,
    subsample=0.5,
    min_child_weight=50,
    gamma=5,
    reg_lambda=0.5,
    alpha=6,
    colsample_bytree=0.5,
)

# impute -> scale -> random oversampling -> XGBoost
ovsmp_pipe = pipe_imb(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('sampler', RandomOverSampler(sampling_strategy=0.6, random_state=42)),
    ('classifier', XGBClassifier(**xgb_settings)),
])

# Search space. Every entry is commented out, so the grid holds a single empty
# candidate and the pipeline is evaluated exactly as configured above.
params = [{
    # 'sampler__sampling_strategy': [0.5,0.6,0.7,0.8,0.9],
    # 'classifier__n_estimators':[1000,1300,1500],
    # 'classifier__eta': [0.01,0.005,0.0001],
    # 'classifier__max_depth':[2,3,4],
    # 'classifier__min_child_weight': [50,100,150],
    # 'classifier__colsample_bytree':[0.5,0.7,1],
    # 'classifier__subsample' : [0.5,0.6,0.7,0.8,0.9],
    # 'classifier__alpha':[4,5,6],  # default 0. Increasing this value will make model more conservative.
    # 'classifier__gamma':[4,5,6],  # default 0. Increasing this value will make model more conservative.
    # 'classifier__reg_lambda':[0.5,1,1.5],  # default 1. Increasing this value will make model more conservative.
    # 'classifier__scale_pos_weight' : [1]
}]

# train/validation folds keep the same ratio of classes
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=ovsmp_pipe,
    param_grid=params,
    scoring='f1_macro',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search

# Fit on the cluster-0 training split, then report the refit estimator, the
# selected parameters and the search's score (macro F1) on both splits.
grid.fit(X4_0_train, y4_0_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
for caption, features, target in (
    ('\nTraining F1_macro:', X4_0_train, y4_0_train),
    ('Test F1_macro :', X4_0_test, y4_0_test),
):
    print(caption, grid.score(features, target))

In [None]:
# print results

# Score the fitted search on the cluster-0 test split and append one row to dfAcc.
pred_probs = grid.predict_proba(X4_0_test)[:,1]  # scores from column 1 of predict_proba
scores = print_results('D5_XGB_km-clu0_oversmp', y4_0_test, grid.predict(X4_0_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts stored as integers.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when it exists and fall back on older pandas.
_styler = dfAcc.style
_styler.hide(axis="index") if hasattr(_styler, "hide") else _styler.hide_index()

### Tune cluster 1

In [None]:
# Tune

# Fixed booster settings for the cluster-1 model.
# NOTE(review): eval_metric is presumably only consulted when fit() receives an
# eval_set, and none is passed in this notebook - confirm it is still needed.
xgb_settings = dict(
    objective='binary:logistic',
    eval_metric=f1_macro,
    n_estimators=1300,
    eta=0.005,
    max_depth=5,
    subsample=0.9,
    min_child_weight=150,
    gamma=5,
    reg_lambda=1,
    alpha=5,
    colsample_bytree=0.6,
)

# impute -> scale -> random oversampling -> XGBoost
ovsmp_pipe = pipe_imb(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('sampler', RandomOverSampler(sampling_strategy=0.8, random_state=42)),
    ('classifier', XGBClassifier(**xgb_settings)),
])

# Search space. Every entry is commented out, so the grid holds a single empty
# candidate and the pipeline is evaluated exactly as configured above.
params = [{
    # 'sampler__sampling_strategy': [0.5,0.6,0.7,0.8,0.9],
    # 'classifier__n_estimators':[1200,1300,1400],
    # 'classifier__eta': [0.01,0.05,0.1,0.005],
    # 'classifier__max_depth':[4,5,6],
    # 'classifier__min_child_weight': [125,150,175],
    # 'classifier__colsample_bytree':[0.5,0.6,0.7,0.8,0.9,1],
    # 'classifier__subsample' : [0.5,0.6,0.7,0.8,0.9,1],
    # 'classifier__alpha':[1,3,5],  # default 0. Increasing this value will make model more conservative.
    # 'classifier__gamma':[1,3,5],  # default 0. Increasing this value will make model more conservative.
    # 'classifier__reg_lambda':[0.5,1,1.5],  # default 1. Increasing this value will make model more conservative.
    # 'classifier__scale_pos_weight' : [1]
}]

# train/validation folds keep the same ratio of classes
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=ovsmp_pipe,
    param_grid=params,
    scoring='f1_macro',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search

# Fit on the cluster-1 training split, then report the refit estimator, the
# selected parameters and the search's score (macro F1) on both splits.
grid.fit(X4_1_train, y4_1_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
for caption, features, target in (
    ('\nTraining F1_macro:', X4_1_train, y4_1_train),
    ('Test F1_macro :', X4_1_test, y4_1_test),
):
    print(caption, grid.score(features, target))

In [None]:
# print results

# Score the fitted search on the cluster-1 test split and append one row to dfAcc.
pred_probs = grid.predict_proba(X4_1_test)[:,1]  # scores from column 1 of predict_proba
scores = print_results('D5_XGB_km-clu1_oversmp', y4_1_test, grid.predict(X4_1_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts stored as integers.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when it exists and fall back on older pandas.
_styler = dfAcc.style
_styler.hide(axis="index") if hasattr(_styler, "hide") else _styler.hide_index()

### Tune cluster 2

In [None]:
# Tune

# Fixed booster settings for the cluster-2 model.
# NOTE(review): eval_metric is presumably only consulted when fit() receives an
# eval_set, and none is passed in this notebook - confirm it is still needed.
xgb_settings = dict(
    objective='binary:logistic',
    eval_metric=f1_macro,
    n_estimators=1300,
    eta=0.02,
    max_depth=3,
    subsample=0.9,
    min_child_weight=150,
    gamma=3,
    reg_lambda=1,
    alpha=5,
    colsample_bytree=0.7,
)

# impute -> scale -> random oversampling -> XGBoost
ovsmp_pipe = pipe_imb(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('sampler', RandomOverSampler(sampling_strategy=0.4, random_state=42)),
    ('classifier', XGBClassifier(**xgb_settings)),
])

# Search space. Every entry is commented out, so the grid holds a single empty
# candidate and the pipeline is evaluated exactly as configured above.
params = [{
    # 'sampler__sampling_strategy': [0.3,0.4,0.5],
    # 'classifier__n_estimators':[1200,1300,1400],
    # 'classifier__eta': [0.01,0.02,0.005,0.0001],
    # 'classifier__max_depth':[3,5,7],
    # 'classifier__min_child_weight': [50,100,150],
    # 'classifier__colsample_bytree':[0.5,0.6,0.7,0.8,0.9,1],
    # 'classifier__subsample' : [0.5,0.6,0.7,0.8,0.9,1],
    # 'classifier__alpha':[1,3,5],  # default 0. Increasing this value will make model more conservative.
    # 'classifier__gamma':[0,1,3,5],  # default 0. Increasing this value will make model more conservative.
    # 'classifier__reg_lambda':[0,1,3,5],  # default 1. Increasing this value will make model more conservative.
    # 'classifier__scale_pos_weight' : [1]
}]

# train/validation folds keep the same ratio of classes
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=ovsmp_pipe,
    param_grid=params,
    scoring='f1_macro',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search

# Fit on the cluster-2 training split, then report the refit estimator, the
# selected parameters and the search's score (macro F1) on both splits.
grid.fit(X4_2_train, y4_2_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
for caption, features, target in (
    ('\nTraining F1_macro:', X4_2_train, y4_2_train),
    ('Test F1_macro :', X4_2_test, y4_2_test),
):
    print(caption, grid.score(features, target))

In [None]:
# print results

# Score the fitted search on the cluster-2 test split and append one row to dfAcc.
pred_probs = grid.predict_proba(X4_2_test)[:,1]  # scores from column 1 of predict_proba
scores = print_results('D5_XGB_km-clu2_oversmp', y4_2_test, grid.predict(X4_2_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts stored as integers.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when it exists and fall back on older pandas.
_styler = dfAcc.style
_styler.hide(axis="index") if hasattr(_styler, "hide") else _styler.hide_index()

### Tune cluster 3

In [None]:
# Tune

# Fixed booster settings for the cluster-3 model.
# NOTE(review): eval_metric is presumably only consulted when fit() receives an
# eval_set, and none is passed in this notebook - confirm it is still needed.
xgb_settings = dict(
    objective='binary:logistic',
    eval_metric=f1_macro,
    n_estimators=1000,
    eta=0.005,
    max_depth=3,
    subsample=0.7,
    min_child_weight=100,
    gamma=5,
    reg_lambda=3,
    alpha=5,
    colsample_bytree=0.5,
)

# impute -> scale -> random oversampling -> XGBoost
ovsmp_pipe = pipe_imb(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', StandardScaler()),
    ('sampler', RandomOverSampler(sampling_strategy=1, random_state=42)),
    ('classifier', XGBClassifier(**xgb_settings)),
])

# Search space. Only max_depth is active, with the single value 2, which
# replaces the max_depth=3 set on the classifier above during the search.
params = [{
    # 'sampler__sampling_strategy': [0,1],
    # 'classifier__n_estimators':[950,1000,1050],
    # 'classifier__eta': [0.01,0.1,0.05,0.005],
    'classifier__max_depth': [2],
    # 'classifier__min_child_weight': [50,100,150],
    # 'classifier__colsample_bytree':[0.5,0.6,0.7,0.8,0.9,1],
    # 'classifier__subsample' : [0.5,0.6,0.7,0.8,0.9,1],
    # 'classifier__alpha':[1,3,5],  # default 0. Increasing this value will make model more conservative.
    # 'classifier__gamma':[0,1,3,5],  # default 0. Increasing this value will make model more conservative.
    # 'classifier__reg_lambda':[2,3,4],  # default 1. Increasing this value will make model more conservative.
    # 'classifier__scale_pos_weight' : [1]
}]

# train/validation folds keep the same ratio of classes
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=ovsmp_pipe,
    param_grid=params,
    scoring='f1_macro',
    cv=kfold,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the grid search

# Fit on the cluster-3 training split, then report the refit estimator, the
# selected parameters and the search's score (macro F1) on both splits.
grid.fit(X4_3_train, y4_3_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
for caption, features, target in (
    ('\nTraining F1_macro:', X4_3_train, y4_3_train),
    ('Test F1_macro :', X4_3_test, y4_3_test),
):
    print(caption, grid.score(features, target))

In [None]:
# Recorded output of a previous search run, kept for reference. The parameters
# listed here differ from the cluster-3 configuration in the tuning cell, so
# this presumably comes from an earlier, wider grid - confirm before relying on it.
"""
Best parameters  : {'classifier__colsample_bytree': 0.9, 'classifier__eta': 0.01, 'classifier__n_estimators': 1500, 'classifier__subsample': 0.9}

Training F1_macro: 0.8436718625554671
Test F1_macro : 0.812172183034907
    """

In [None]:
# print results

# Score the fitted search on the cluster-3 test split and append one row to dfAcc.
pred_probs = grid.predict_proba(X4_3_test)[:,1]  # scores from column 1 of predict_proba
scores = print_results('D5_XGB_km-clu3_oversmp', y4_3_test, grid.predict(X4_3_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts stored as integers.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when it exists and fall back on older pandas.
_styler = dfAcc.style
_styler.hide(axis="index") if hasattr(_styler, "hide") else _styler.hide_index()

## Permutation Importance

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted search object, computed on the
# cluster-3 training split with 30 shuffles per feature and a fixed seed.
r = permutation_importance(
    estimator=grid,
    X=X4_3_train,
    y=y4_3_train,
    random_state=42,
    n_repeats=30,
)

In [None]:
# Plot the raw importances on a wide canvas (presumably one line per repeat
# with the feature position on the x axis - confirm against r.importances.shape).
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(r.importances);

In [None]:
# Mean permutation importance at position 9 (presumably the feature that
# stands out in the plot - confirm).
r.importances_mean[9]

In [None]:
# Name of the training column at that same position (9).
X4_3_train.columns[9]