In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsRegressor
import category_encoders as ce
import pickle
from statsmodels.stats.outliers_influence import variance_inflation_factor as VIF
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
import os
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import *
from imblearn.under_sampling import *
from imblearn.combine import *

In [16]:
data = pd.read_csv('Data/groupDF_clean.csv', index_col=[0])

In [17]:
np.random.seed(5)
ids = np.random.permutation(data.index)
trainN = int(0.8*len(data))
trainIDs = ids[:trainN]
testIDs = ids[trainN:]

In [18]:
trainData = data.loc[trainIDs]
testData = data.loc[testIDs]

trainX = trainData.drop(['payroll_ind'], axis=1)
testX = testData.drop(['payroll_ind'], axis=1)

trainY = trainData['payroll_ind']
testY = testData['payroll_ind']

#### Combinations

- SMOTE --> ENN  
- SMOTE --> TOMEK  
- NCL (modifed ENN to do both US and OS)  
- ROS
- TOMEK --> CNN (OSS) (SKIPPED)

In [20]:
pipe = Pipeline([
    ('sampler', 'passthrough'),
    ('classify', LGBMClassifier(objective='binary', bagging_freq=5, early_stopping_rounds=20))
])

paramGrid1 = [
    {
    'sampler': [SMOTEENN()],
    'classify__learning_rate': [0.005, 0.01, 0.1],
    'classify__num_leaves': [5, 7, 10, 25],
    'classify__num_trees': [25, 50, 100, 250],
    'classify__lambda_l2': [0.1, 1, 10],
    'classify__feature_fraction': [0.7, 0.85, 1],
    'classify__bagging_fraction': [0.7, 0.85, 1],
    },
    {
    'sampler': [SMOTETomek()],
    'classify__learning_rate': [0.005, 0.01, 0.1],
    'classify__num_leaves': [5, 7, 10, 25],
    'classify__num_trees': [25, 50, 100, 250],
    'classify__lambda_l2': [0.1, 1, 10]
    'classify__feature_fraction': [0.7, 0.85, 1],
    'classify__bagging_fraction': [0.7, 0.85, 1],
    }
]

paramGrid2 = [
    {
    'sampler': [RandomOverSampler(sampling_strategy=0.7)],
    'classify__learning_rate': [0.005, 0.01, 0.1],
    'classify__num_leaves': [5, 7, 10, 25],
    'classify__num_trees': [25, 50, 100, 250],
    'classify__lambda_l2': [0.1, 1, 10],
    'classify__feature_fraction': [0.7, 0.85, 1],
    'classify__bagging_fraction': [0.7, 0.85, 1],
    },
    {
    'sampler': [NeighbourhoodCleaningRule()],
    'classify__learning_rate': [0.005, 0.01, 0.1],
    'classify__num_leaves': [5, 7, 10, 25],
    'classify__num_trees': [25, 50, 100, 250],
    'classify__lambda_l2': [0.1, 1, 10]
    'classify__feature_fraction': [0.7, 0.85, 1],
    'classify__bagging_fraction': [0.7, 0.85, 1],
    }
]

uselessCols = ['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score']

SyntaxError: invalid syntax (<ipython-input-20-a5c459e80ae0>, line 13)

In [None]:
search1 = GridSearchCV(pipe, n_jobs=-1, param_grid=paramGrid1, verbose=10, cv=5, scoring='roc_auc')
search1.fit(trainX.values, trainY)

In [None]:
result1 = pd.DataFrame(search1.cv_results_).sort_values(by='rank_test_score', axis=0).drop(uselessCols, axis=1)
pickle.dump(result1, open('cvResults_1.sav', 'wb'))
result1

In [None]:
search2 = GridSearchCV(pipe, n_jobs=-1, param_grid=paramGrid2, verbose=10, cv=5, scoring='roc_auc')
search2.fit(trainX.values, trainY)

In [23]:
result2 = pd.DataFrame(search2.cv_results_).sort_values(by='rank_test_score', axis=0).drop(uselessCols, axis=1)
pickle.dump(result2, open('cvResults_2.sav', 'wb'))
result2

Unnamed: 0,param_classify__feature_fraction,param_classify__learning_rate,param_classify__num_leaves,param_classify__num_trees,param_sampler1,param_sampler2,mean_test_score,std_test_score,rank_test_score
39,0.85,0.005,10,250,RandomUnderSampler(sampling_strategy=0.7),RandomOverSampler(),0.634242,0.023208,1
268,0.85,0.007,10,300,TomekLinks(),RandomOverSampler(),0.632534,0.019432,2
300,1,0.007,7,250,TomekLinks(),RandomOverSampler(),0.631279,0.022222,3
288,1,0.005,7,250,TomekLinks(),RandomOverSampler(),0.630810,0.017760,4
292,1,0.005,10,300,TomekLinks(),RandomOverSampler(),0.630196,0.022509,5
...,...,...,...,...,...,...,...,...,...
35,0.7,0.01,25,350,RandomUnderSampler(sampling_strategy=0.7),RandomOverSampler(),0.602754,0.029174,320
71,0.85,0.01,25,350,RandomUnderSampler(sampling_strategy=0.7),RandomOverSampler(),0.602175,0.030578,321
47,0.85,0.005,25,350,RandomUnderSampler(sampling_strategy=0.7),RandomOverSampler(),0.601586,0.008924,322
79,1,0.005,15,300,RandomUnderSampler(sampling_strategy=0.7),RandomOverSampler(),0.600892,0.016079,323
