# Resampling (SMOTE, Random Oversampling, Random Undersampling) on Network-Features-Enhanced-Dataset.
With / without PCA

In [None]:
import pandas as pd
import numpy as np
import pickle 
import matplotlib.pyplot as plt
import networkx as nx
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, average_precision_score
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from imblearn.pipeline import Pipeline as pipe_imb
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import make_pipeline as make_pipeline_imb

def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Passed as ``eval_metric`` to the XGBClassifier instances further down;
    the grid searches themselves use the built-in ``'f1_macro'`` scorer string.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

In [None]:
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.

# Results table of the baseline experiment (loaded here, not referenced again
# in the visible cells).
with open('OM_D3_results_table', 'rb') as f:
    baseline_nx_table = pickle.load(f)

# Dataset enriched with network features; must expose `.shape`, `.drop` and a
# "label" column (presumably a pandas DataFrame — confirm).
with open('OM_D3_Dataset_2_nx-features-added-brand_new', 'rb') as f:
    data_nx = pickle.load(f)
print(data_nx.shape)


# Prediction

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, average_precision_score

In [None]:
# Build one row of evaluation metrics for the results table.
def print_results(headline, true_value, pred, probs):
    """Return the metrics of one model as a list (nothing is printed).

    The order matches ``score_names``: method label, accuracy, TP, FP, TN, FN,
    precision, recall, ROC AUC, PR AUC and macro F1.

    headline   -- label stored in the 'method' column
    true_value -- ground-truth labels
    pred       -- hard class predictions
    probs      -- scores for the positive class, used for ROC / PR curves
    """
    cm = confusion_matrix(true_value, pred)
    row = [
        headline,
        accuracy_score(true_value, pred),      # accuracy
        int(cm[1, 1]),                         # TP
        int(cm[0, 1]),                         # FP
        int(cm[0, 0]),                         # TN
        int(cm[1, 0]),                         # FN
        precision_score(true_value, pred),     # precision
        recall_score(true_value, pred),        # recall
        roc_auc_score(true_value, probs),      # roc_auc
    ]
    # Area under the precision-recall curve: recall on x, precision on y.
    precision_pts, recall_pts, _ = precision_recall_curve(true_value, probs)
    row.append(auc(recall_pts, precision_pts))                   # pr_auc
    row.append(f1_score(true_value, pred, average="macro"))      # f1 (macro)
    return row

# Column layout of the results table; must stay in the order print_results() emits.
score_names = [
    'method', 'accuracy',
    'TP', 'FP', 'TN', 'FN',
    'precision', 'recall', 'roc_auc', 'pr_auc', 'f1',
]
# Empty table (0 rows, one column per score) that each experiment appends to.
dfAcc = pd.DataFrame(np.zeros((0, len(score_names))), columns=score_names)

In [None]:
# Features: everything except the customer identifier and the target column.
X = data_nx.drop(columns=["CustomerID", "label"])
# Target vector.
y = data_nx["label"]

In [None]:
# The categorical columns are dropped from X, so every remaining column is
# handled by the numeric branch of the preprocessing.
cat_vars = ['EmailContactable', 'City', 'last_coupon_type_used', 'DeviceType']
X.drop(columns=cat_vars, inplace=True)
num_vars = X.columns.tolist()

# Train test split

In [None]:
# Hold out 33% of the rows as the test set (fixed seed for reproducibility).
# NOTE(review): the split is not stratified on y; with an imbalanced label the
# class ratio of train and test may differ slightly — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
# Same value as computed right after dropping the categorical columns.
num_vars = X.columns.tolist()

In [None]:
"""pipe_cat = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("one_hot", OneHotEncoder(handle_unknown="ignore"))
])
"""
# Numeric preprocessing: fill missing values with the column median, then
# standardise.
pipe_num = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Only the numeric branch is active; the categorical branch is disabled.
# Columns not listed in num_vars would be passed through unchanged.
ct = ColumnTransformer(
    transformers=[
        # ("categorical_vars", pipe_cat, cat_vars),
        ("numeric_vars", pipe_num, num_vars),
    ],
    remainder="passthrough",
)

# Logistic Regression

In [None]:
# Random oversampling + logistic regression.
# The sampler is a step of the imblearn pipeline, so resampling happens as part
# of fitting the pipeline rather than on the whole dataset up front.
ovsmp_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', RandomOverSampler(random_state=42, sampling_strategy=0.6)),
    ('classifier', LogisticRegression(random_state=42, max_iter=100000)),
])

# Two sub-grids: the l1 penalty is only combined with liblinear / saga, the l2
# penalty with all five solvers.
# NOTE(review): a sampling_strategy below the data's existing minority/majority
# ratio presumably makes the sampler fail for that candidate — confirm.
params = [
    {
        'classifier__C': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'classifier__penalty': ['l1'],
        'classifier__solver': ['liblinear', 'saga'],
        # 'dim_red__n_components': [8, 10, 13, 15],
        'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    },
    {
        'classifier__C': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
        # 'dim_red__n_components': [8, 10, 13, 15],
        'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    },
]

# Stratified folds keep the class ratio equal in every train/validation part.
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

# Exhaustive search, model selection by macro F1, all cores.
grid = GridSearchCV(
    estimator=ovsmp_pipe,
    param_grid=params,
    cv=kfold,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its macro F1 on the training and held-out test sets.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1_macro:', train_f1)
test_f1 = grid.score(X_test, y_test)
print('Test F1_macro :', test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_LoR_oversmp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

# XGB / OVER

In [None]:
def f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Same definition as the one next to the imports at the top of the notebook;
    repeated here so this section can be run on its own.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

In [None]:
# XGB
from xgboost import XGBClassifier

# Random oversampling + XGBoost with fixed hyper-parameters; only the sampler
# ratio is searched.
# NOTE(review): eval_metric is a custom callable, but no eval_set is passed
# through the pipeline, so it is presumably never evaluated — confirm against
# the installed xgboost version.
ovsmp_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', RandomOverSampler(random_state=42, sampling_strategy=0.5)),
    ('classifier', XGBClassifier(
        objective='binary:logistic',
        eval_metric=f1_macro,
        n_estimators=1000,
        eta=0.01,  # default 0.3
        max_depth=6,
        subsample=0.8,
        min_child_weight=25,
        gamma=1,
        reg_lambda=1,
        alpha=1,
        colsample_bytree=0.9,
        # colsample_bylevel=0.5,
        # scale_pos_weight=0.35,
    )),
])

# Candidates kept for reference but currently disabled are listed as comments.
params = [{
    'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    # 'dim_red__n_components': [8, 10, 13, 15],
    # 'classifier__n_estimators': [750, 1000, 1500],
    # 'classifier__eta': [0.01, 0.001, 0.0001],
    # 'classifier__max_depth': [1],
    # 'classifier__min_child_weight': [200],
    # 'classifier__colsample_bytree': [0.5],  # ,0.6,0.7,0.8,0.9
    # 'classifier__subsample': [0.7],
    # 'classifier__alpha': [5],       # default 0; larger = more conservative model
    # 'classifier__gamma': [5],       # default 0; larger = more conservative model
    # 'classifier__reg_lambda': [5],  # default 1; larger = more conservative model
    # 'classifier__scale_pos_weight': [1],
}]

# Stratified folds keep the class ratio equal in every train/validation part.
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

grid = GridSearchCV(
    estimator=ovsmp_pipe,
    param_grid=params,
    cv=kfold,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its score (macro F1, the search's scoring) on train and test data.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1 score:', train_f1)
test_f1 = grid.score(X_test, y_test)
print("Test F1 score:", test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_XGB_oversmp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

# LIGHTGBM / OVER

In [None]:
# LIGHTGBM
import lightgbm as lgb

# Random oversampling + LightGBM.
# NOTE(review): min_child_samples and min_data_in_leaf look like aliases of the
# same LightGBM parameter, and subsample presumably has no effect while
# subsample_freq stays at its default — confirm against the LightGBM docs.
ovsmp_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', RandomOverSampler(random_state=42, sampling_strategy=0.5)),
    ('LGBM', lgb.LGBMClassifier(
        objective="binary",
        # class_weight="balanced",
        n_estimators=1000,
        learning_rate=0.01,
        min_child_weight=25,
        max_depth=6,
        num_leaves=25,
        min_child_samples=25,
        reg_alpha=1,
        reg_lambda=1,
        # subsample_freq=0,
        colsample_bytree=0.8,
        subsample=0.7,
        min_split_gain=25,
        min_data_in_leaf=25,
        random_state=42,
    )),
])

# The sampler ratio is searched; four LGBM settings are overridden with a
# single value each. Disabled candidates are kept as comments.
param_grid = [{
    'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    # 'dim_red__n_components': [8, 10, 13, 15],
    # 'LGBM__n_estimators': [750, 1000, 1500],
    # 'LGBM__learning_rate': [0.0001, 0.001, 0.01],
    'LGBM__min_child_weight': [60],
    'LGBM__max_depth': [2],
    # 'LGBM__num_leaves': [500],
    # 'LGBM__min_child_samples': [100, 150, 200, 250, 300],
    # 'LGBM__reg_alpha': [1, 1.5, 2, 2.5, 3],    # default 0
    'LGBM__min_data_in_leaf': [50],
    # 'LGBM__reg_lambda': [1, 1.5, 2, 2.5, 3],   # default 0
    # 'LGBM__subsample_freq': [1, 5, 10, 100, 500],
    # 'LGBM__subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
    'LGBM__colsample_bytree': [0.1],
    # 'LGBM__min_split_gain': [50],
}]

# Stratified folds keep the class ratio equal in every train/validation part.
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=ovsmp_pipe,
    param_grid=param_grid,
    cv=kfolds,
    scoring='f1_macro',
    verbose=2,
    n_jobs=-1,
)
# Author's note: default lgbm = 0.57 (presumably the macro F1 of an untuned model).

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its score (macro F1, the search's scoring) on train and test data.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1 score:', train_f1)
test_f1 = grid.score(X_test, y_test)
print("Test F1 score:", test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_LGBM_oversmp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

# UNDERSAMPLING / LOR

In [None]:
# Random undersampling + logistic regression.
unsmp_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.5)),
    ('classifier', LogisticRegression(random_state=42, max_iter=100000)),
])

# Two sub-grids: the l1 penalty is only combined with liblinear / saga, the l2
# penalty with all five solvers.
# NOTE(review): a sampling_strategy below the data's existing minority/majority
# ratio presumably makes the sampler fail for that candidate — confirm.
params = [
    {
        'classifier__C': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'classifier__penalty': ['l1'],
        'classifier__solver': ['liblinear', 'saga'],
        # 'dim_red__n_components': [8, 10, 13, 15],
        'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    },
    {
        'classifier__C': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
        # 'dim_red__n_components': [8, 10, 13, 15],
        'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    },
]

# Stratified folds keep the class ratio equal in every train/validation part.
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

grid = GridSearchCV(
    estimator=unsmp_pipe,
    param_grid=params,
    cv=kfold,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its macro F1 on the training and held-out test sets.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1_macro:', train_f1)
test_f1 = grid.score(X_test, y_test)
print('Test F1_macro :', test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_LOR_undersmp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

# XGB / UNDER

In [None]:
# XGB
from xgboost import XGBClassifier

# Random undersampling + XGBoost with fixed hyper-parameters. The sampler is
# created without an explicit sampling_strategy; the grid search sets it.
# NOTE(review): eval_metric is a custom callable, but no eval_set is passed
# through the pipeline, so it is presumably never evaluated — confirm against
# the installed xgboost version.
unsmp_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', RandomUnderSampler(random_state=42)),
    ('classifier', XGBClassifier(
        objective='binary:logistic',
        eval_metric=f1_macro,
        n_estimators=1000,
        eta=0.01,  # default 0.3
        max_depth=6,
        subsample=0.8,
        min_child_weight=25,
        gamma=1,
        reg_lambda=1,
        alpha=1,
        colsample_bytree=0.9,
        # colsample_bylevel=0.5,
        # scale_pos_weight=0.35,
    )),
])


# Only the undersampling ratio is searched for XGB.
# (The closing brace of this dict used to sit inside the trailing comment,
# which left the literal unterminated and made the cell a SyntaxError.)
params = [{
    'sampler__sampling_strategy': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
    # 'dim_red__n_components': [8, 10, 13,15],
}]

# Stratified folds keep the class ratio equal in every train/validation part.
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

# Exhaustive search over `params`, model selection by macro F1, all cores.
grid = GridSearchCV(
    estimator=unsmp_pipe,
    param_grid=params,
    cv=kfold,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its macro F1 on the training and held-out test sets.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1_macro:', train_f1)
test_f1 = grid.score(X_test, y_test)
print('Test F1_macro :', test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_XGB_undersmp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

# LGBM / UNDER

In [None]:
# Random undersampling + LightGBM with fixed hyper-parameters; only the
# sampler ratio is searched.
# NOTE(review): min_child_samples and min_data_in_leaf look like aliases of the
# same LightGBM parameter, and subsample presumably has no effect while
# subsample_freq stays at its default — confirm against the LightGBM docs.
unsmp_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', RandomUnderSampler(random_state=42, sampling_strategy=0.5)),
    ('LGBM', lgb.LGBMClassifier(
        objective="binary",
        # class_weight="balanced",
        n_estimators=1000,
        learning_rate=0.01,
        min_child_weight=25,
        max_depth=6,
        num_leaves=25,
        min_child_samples=25,
        reg_alpha=1,
        reg_lambda=1,
        # subsample_freq=0,
        colsample_bytree=0.8,
        subsample=0.7,
        min_split_gain=25,
        min_data_in_leaf=25,
        random_state=42,
    )),
])

# Disabled candidates are kept as comments for reference.
param_grid = [{
    'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    # 'dim_red__n_components': [8, 10, 13, 15],
    # 'LGBM__n_estimators': [750, 1000, 1500],
    # 'LGBM__learning_rate': [0.0001, 0.001, 0.01],
    # 'LGBM__min_child_weight': [50, 100, 150],
    # 'LGBM__max_depth': [5],
    # 'LGBM__num_leaves': [100, 150, 200, 250, 300],
    # 'LGBM__min_child_samples': [100, 150, 200, 250, 300],
    # 'LGBM__reg_alpha': [1, 1.5, 2, 2.5, 3],    # default 0
    # 'LGBM__min_data_in_leaf': [100, 150, 200, 250, 300],
    # 'LGBM__reg_lambda': [1, 1.5, 2, 2.5, 3],   # default 0
    # 'LGBM__subsample_freq': [1, 5, 10, 100, 500],
    # 'LGBM__subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
    # 'LGBM__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    # 'LGBM__min_split_gain': [122],
}]

# Stratified folds keep the class ratio equal in every train/validation part.
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=unsmp_pipe,
    param_grid=param_grid,
    cv=kfolds,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its macro F1 on the training and held-out test sets.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1_macro:', train_f1)
test_f1 = grid.score(X_test, y_test)
print('Test F1_macro :', test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_LGBM_undersmp', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

# SMOTE / LOR

In [None]:
# SMOTE oversampling + logistic regression.
smote_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', SMOTE(random_state=42, sampling_strategy=0.6, k_neighbors=4)),
    ('classifier', LogisticRegression(random_state=42, max_iter=100000)),
])

# Two sub-grids: the l1 penalty is only combined with liblinear / saga, the l2
# penalty with all five solvers. k_neighbors stays at its constructor value.
params = [
    {
        'classifier__C': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'classifier__penalty': ['l1'],
        'classifier__solver': ['liblinear', 'saga'],
        # 'dim_red__n_components': [8, 10, 13, 15],
        'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    },
    {
        'classifier__C': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
        'classifier__penalty': ['l2'],
        'classifier__solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
        # 'dim_red__n_components': [8, 10, 13, 15],
        'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    },
]

# Stratified folds keep the class ratio equal in every train/validation part.
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

grid = GridSearchCV(
    estimator=smote_pipe,
    param_grid=params,
    cv=kfold,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its macro F1 on the training and held-out test sets.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1_macro:', train_f1)
test_f1 = grid.score(X_test, y_test)
print('Test F1_macro :', test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_LoR_smote', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

# SMOTE / XGB

In [None]:
# XGB
from xgboost import XGBClassifier

# SMOTE oversampling + XGBoost with fixed hyper-parameters; the search covers
# the sampler's ratio and neighbour count only.
# NOTE(review): eval_metric is a custom callable, but no eval_set is passed
# through the pipeline, so it is presumably never evaluated — confirm against
# the installed xgboost version.
smote_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', SMOTE(random_state=42, sampling_strategy=0.6, k_neighbors=4)),
    ('classifier', XGBClassifier(
        objective='binary:logistic',
        eval_metric=f1_macro,
        n_estimators=1000,
        eta=0.01,  # default 0.3
        max_depth=6,
        subsample=0.8,
        min_child_weight=25,
        gamma=1,
        reg_lambda=1,
        alpha=1,
        colsample_bytree=0.9,
        # colsample_bylevel=0.5,
        # scale_pos_weight=0.35,
    )),
])

# Disabled candidates are kept as comments for reference.
params = {
    'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    "sampler__k_neighbors": [3, 4, 5, 6, 7, 8, 9, 10],
    # 'dim_red__n_components': [8, 10, 13, 15],
    # 'classifier__n_estimators': [750, 1000, 1500],
    # 'classifier__eta': [0.01, 0.001, 0.0001],
    # 'classifier__max_depth': [5, 6],
    # 'classifier__min_child_weight': [25],
    # 'classifier__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],   # 0.5
    # 'classifier__subsample': [0.5, 0.6, 0.7, 0.8, 0.9],          # 0.5
    # 'classifier__alpha': [0.2, 0.3, 0.4],       # default 0; larger = more conservative model
    # 'classifier__gamma': [0.2, 0.3, 0.4],       # default 0; larger = more conservative model
    # 'classifier__reg_lambda': [0.5, 1, 1.5, 2, 2.5],   # default 1; larger = more conservative model
    # 'classifier__scale_pos_weight': [0.1, 0.25, 0.3, 0.35, 1],   # 0.35
}

# Stratified folds keep the class ratio equal in every train/validation part.
kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)

grid = GridSearchCV(
    estimator=smote_pipe,
    param_grid=params,
    cv=kfold,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its macro F1 on the training and held-out test sets.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1_macro:', train_f1)
test_f1 = grid.score(X_test, y_test)
print('Test F1_macro :', test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_XGB_smote', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

# SMOTE / LGBM

In [None]:
# SMOTE oversampling + LightGBM with fixed hyper-parameters; the search covers
# the sampler's ratio and neighbour count only.
# NOTE(review): min_child_samples and min_data_in_leaf look like aliases of the
# same LightGBM parameter, and subsample presumably has no effect while
# subsample_freq stays at its default — confirm against the LightGBM docs.
smote_pipe = pipe_imb(steps=[
    ('ct', ct),
    # ('dim_red', PCA()),   # optional PCA step, currently disabled
    ('sampler', SMOTE(random_state=42, sampling_strategy=0.9, k_neighbors=10)),
    ('LGBM', lgb.LGBMClassifier(
        objective="binary",
        # class_weight="balanced",
        n_estimators=1000,
        learning_rate=0.01,
        min_child_weight=25,
        max_depth=6,
        num_leaves=25,
        min_child_samples=25,
        reg_alpha=1,
        reg_lambda=1,
        # subsample_freq=0,
        colsample_bytree=0.8,
        subsample=0.7,
        min_split_gain=25,
        min_data_in_leaf=25,
        random_state=42,
    )),
])

# Disabled candidates are kept as comments for reference.
param_grid = {
    'sampler__sampling_strategy': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    'sampler__k_neighbors': [3, 4, 5, 6, 7, 8, 9, 10],
    # 'dim_red__n_components': [8, 10, 13, 15],
    # 'LGBM__n_estimators': [750, 1000, 1500],
    # 'LGBM__learning_rate': [0.0001, 0.001, 0.01],
    # 'LGBM__min_child_weight': [100],
    # 'LGBM__max_depth': [2],
    # 'LGBM__num_leaves': [100, 150, 200, 250, 300],
    # 'LGBM__min_child_samples': [100, 150, 200, 250, 300],
    # 'LGBM__reg_alpha': [1, 1.5, 2, 2.5, 3],    # default 0
    # 'LGBM__min_data_in_leaf': [100, 150, 200, 250, 300],
    # 'LGBM__reg_lambda': [1, 1.5, 2, 2.5, 3],   # default 0
    # 'LGBM__subsample_freq': [1, 5, 10, 100, 500],
    # 'LGBM__subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
    # 'LGBM__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    # 'LGBM__min_split_gain': [122],
}

# Stratified folds keep the class ratio equal in every train/validation part.
kfolds = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=smote_pipe,
    param_grid=param_grid,
    cv=kfolds,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
)

In [None]:
%%time
# Fit every candidate with cross-validation, then report the winning pipeline
# and its macro F1 on the training and held-out test sets.
grid.fit(X_train, y_train)
print(grid.best_estimator_, '\n')
print('Best parameters  :', grid.best_params_)
train_f1 = grid.score(X_train, y_train)
print('\nTraining F1_macro:', train_f1)
test_f1 = grid.score(X_test, y_test)
print('Test F1_macro :', test_f1)

In [None]:
# Score the fitted search on the test set and append one row to dfAcc.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes labels are 0/1 — TODO confirm).
pred_probs = grid.predict_proba(X_test)[:,1]
scores = print_results('D4_LGBM_smote', y_test, grid.predict(X_test), pred_probs)
dftmp = pd.DataFrame([scores], columns=score_names)
dfAcc = pd.concat([dfAcc, dftmp], ignore_index=True)
# Keep the confusion-matrix counts as integers after the concat.
dfAcc[['TP','FP', 'TN', 'FN']] = dfAcc[['TP','FP', 'TN', 'FN']].astype(int)
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0; use
# Styler.hide(axis="index") when available and fall back on older pandas.
dfAcc.style.hide(axis="index") if hasattr(dfAcc.style, "hide") else dfAcc.style.hide_index()

In [None]:
# Display the accumulated results table (one row per sampler / classifier
# combination evaluated above).
dfAcc

# EXPORT

In [None]:
import pickle

# Persist a copy of the results table. The context manager guarantees the
# file is flushed and closed even if pickling raises.
dfAcc_pck = dfAcc.copy()
with open("OM_D4_results_table_nx-same-params-tune_only_sampler_params", 'wb') as fd:
    pickle.dump(dfAcc_pck, fd)

# PERMUTATION IMPORTANCE

In [None]:
from sklearn.inspection import permutation_importance

# `grid` is whichever search was fitted last (the SMOTE / LGBM one when the
# cells run top to bottom).
# NOTE(review): importances are measured on the training split, not on the
# held-out data — confirm this is intended.
r = permutation_importance(
    estimator=grid,
    X=X_train,
    y=y_train,
    n_repeats=30,
    random_state=42,
)

In [None]:
# Plot the raw permutation importances on a wide figure.
# NOTE(review): r.importances is presumably (n_features, n_repeats), so this
# draws one line per repeat over the feature index — confirm.
plt.figure(figsize=(20,10))
plt.plot(r.importances);

In [None]:
# Mean importance of each feature over the permutation repeats.
r.importances_mean # author's note: "total net amount" (presumably the standout feature — TODO confirm)

In [None]:
# Names of the features whose mean permutation importance exceeds 0.01.
# Relies on X having the same column order as X_train (X_train is a row
# subset of X produced by train_test_split).
X.columns[r.importances_mean>0.01]