In [12]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.cluster import KMeans

from util import load_pickle_file
from util import save_pickle_file
from util import report_test
from util import upsample_pos
from util import data_preprocessing
from util import rand_train_test

from imblearn.over_sampling import SMOTE

In [2]:
def train_kmeans(x, y, test=None):
    kmeans = KMeans(n_clusters=2, random_state=229).fit(x)
    
    if test is not None:
        x_test, y_test = test
        # clf_acc = report_test(kmeans, test, "kmeans")
        # print(kmeans.cluster_centers_)
        y_pred = kmeans.predict(x_test)
        print((y_pred == y_test).sum()/len(y_test))
        return kmeans.labels_, y_pred
    return kmeans, test, 'kmeans'

def train_svm(x, y, kernel_type, test=None):
    clf_svm = SVC(kernel='linear', probability=True)
    if kernel_type == 'poly':
        clf_svm = SVC(kernel='poly', degree=8, probability=True)
    elif kernel_type == 'rbf':
        clf_svm = SVC(kernel='rbf', probability=True)
    elif kernel_type == 'sigmoid':
        clf_svm = SVC(kernel='sigmoid', probability=True)
    clf_svm.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_svm, test, "svm")
        return clf_svm, test, 'svm'
    return clf_svm, test, 'svm'

def train_lr(x, y, rand_state=229, solver='liblinear',
        max_iter=10000, test=None):
    clf_lr = LogisticRegression(
        random_state=rand_state, solver=solver, max_iter=max_iter, C=0.0001)
    # clf_lr = LogisticRegression(C = 0.0001)
    clf_lr.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_lr, test, "logistic regression")
        return clf_lr, clf_acc
    return clf_lr

def train_rand_forest(x, y, n_est=100, max_depth=3, rand_state=229, test=None):
    # clf_rf = RandomForestClassifier(n_estimators=n_est, max_depth=max_depth,
    #     random_state=rand_state)
    clf_rf = RandomForestClassifier(n_estimators = 100, random_state = 50, n_jobs = -1)
    clf_rf.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_rf, test, "random forest")
        return clf_rf, clf_acc
    return clf_rf

def train_nb(x, y, test=None):
    clf_nb = GaussianNB().fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_nb, test, "Gaussian Naive Bayes")
        return clf_nb, clf_acc
    return clf_nb

def train_mlp(x, y, solver='lbfgs', alpha=1e-4, hls=(10, 40, 40),
        rand_state=229, test=None):
    clf_nn = MLPClassifier(
        solver=solver, alpha=alpha, hidden_layer_sizes=hls,
        random_state=rand_state)
    clf_nn.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_nn, test, "neural network")
        return clf_nn, clf_acc
    return clf_nn

def train_lgbm(x, y, test=None):
    clf_lgbm = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1, )
    clf_lgbm.fit(x, y, verbose=100)

    if test is not None:
        clf_acc = report_test(clf_lgbm, test, "LGBM")
        return clf_lgbm, clf_acc
    return clf_lgbm

In [4]:
training_data_path = './data_processed/training_data.pkl'
label_path = './data_processed/training_lbl.pkl'
# training_data_path = './data_processed/training_data_processed.pkl'
# label_path = './data_processed/training_lbl_processed.pkl'
# training_data_path = './data_processed/training_data.pkl'
# label_path = './data_processed/training_lbl.pkl'
data = load_pickle_file(training_data_path)
label = load_pickle_file(label_path)
print('Training data has been successfully loaded')

Training data has been successfully loaded


In [6]:
y = np.array(label)
x = data
# entries = list(data.columns)
x = np.array(x)
print(x.shape)
# raise
x, y = data_preprocessing(x, y, thres=0.2)

(307511, 275)
(307511, 275)
(307511,)


In [7]:
lr_acc_ls = []
rf_acc_ls = []
nb_acc_ls = []
nn_acc_ls = []
lgbm_acc_ls = []
# kf = KFold(n_splits=1, shuffle=True)
print('Training is starting ... ')
print('shape of x: {}'.format(x.shape))

Training is starting ... 
shape of x: (307511, 275)


In [None]:
# x, y, x_test, y_test = upsample_pos(x, y, upsample=True)
# x, y, x_test, y_test = rand_train_test(x, y)
# save_pickle_file(x, "training_data_up.pkl")
# save_pickle_file(y, "training_lbl_up.pkl")
# save_pickle_file(x_test, "testing_data_up.pkl")
# save_pickle_file(y_test, "testing_lbl_up.pkl")
# x = load_pickle_file('training_data_up.pkl')
# y = load_pickle_file('training_lbl_up.pkl')
# x_test = load_pickle_file('testing_data_up.pkl')
# y_test = load_pickle_file('testing_lbl_up.pkl')
# raise
# print('Percentage of zeros in trainset input: {}'.format(np.count_nonzero(x==0)/x.size))
# print('Number of positive examples: {}, negative: {}'.format((y==1).sum(), (y==0).sum()))
# # for train, test in kf.split(x):
# print("here")
# x_train, x_test, y_train, y_test = x, x_test, y, y_test
# print(x_train.shape)
# print(x_test.shape)
# print(len(y_test==1))
# print(len(y_test==0))

In [16]:
def balance_data(x, y, upsample=False, k_neighbors=1000):
    # less positive, more negative
    all_pos = np.where(y == 1)
    print(len(all_pos))
    x_all_pos = x[all_pos[0]]
    y_all_pos = y[all_pos[0]]

    all_neg = np.where(y == 0)
    print(len(all_neg))
    x_all_neg = x[all_neg[0]]
    y_all_neg = y[all_neg[0]]

    if upsample:
        rand_ind = np.arange(len(x_all_neg))
        np.random.shuffle(rand_ind)
        x_neg_new = x_all_neg[rand_ind[:2*len(x_all_pos)]]
        y_neg_new = y_all_neg[rand_ind[:2*len(x_all_pos)]]
        x_all_new = np.concatenate((x_neg_new, x_all_pos), axis=0)
        y_all_new = np.concatenate((y_neg_new, y_all_pos), axis=0)
        sm = SMOTE(random_state=233333, sampling_strategy=1.0, k_neighbors=k_neighbors)
        x_train, y_train = sm.fit_sample(x_all_new, y_all_new)
    else:
        # undersample: balance train set
        x_all_neg = x_all_neg[:int(5*len(x_all_pos))]
        y_all_neg = y_all_neg[:int(5*len(x_all_pos))]
        x_train = np.concatenate((x_all_neg, x_all_pos), axis=0)
        y_train = np.concatenate((y_all_neg, y_all_pos), axis=0)
    
    rand_shuffle = np.arange(len(x_train))
    np.random.shuffle(rand_shuffle)
    x_train = x_train[rand_shuffle]
    y_train = y_train[rand_shuffle]
    return x_train, y_train

In [17]:
NUM_CLUSTERS = 2
x_train, y_train, x_test, y_test = upsample_pos(x, y, upsample=False)
train_group, test_group = train_kmeans(x_train, y_train, test=[x_test, y_test])
for i in range(NUM_CLUSTERS):
    cur_train_x, cur_train_y = x_train[train_group==i], y_train[train_group==i]
    cur_train_x, cur_train_y = balance_data(cur_train_x, cur_train_y, upsample=False, k_neighbors=1000)
    
    cur_test_x, cur_test_y = x_test[test_group==i], y_test[test_group==i]
    cur_test_x, cur_test_y = balance_data(cur_test_x, cur_test_y, upsample=False, k_neighbors=1000)
    
    print('length of train set {}, test set {}'.format(len(cur_train_x), len(cur_test_x)))
    print('train 1 {}, train 0 {}'.format(len(cur_train_y==1), len(cur_train_y==0)))
    clf_lr, lr_acc = train_lr(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_rf, rf_acc = train_rand_forest(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])

19859
99295
0.4654998740871317
1
1
1
1
length of train set 78797, test set 6702
train 1 78797, train 0 78797
The accuracy for logistic regression classifier is: 0.4795583407937929
Prediction Positive Number: 0 True Number: 3488
Prediction Negative Number: 6702 True Number: 3214
              precision    recall  f1-score   support

           0       0.48      1.00      0.65      3214
           1       0.00      0.00      0.00      3488

    accuracy                           0.48      6702
   macro avg       0.24      0.50      0.32      6702
weighted avg       0.23      0.48      0.31      6702



  'precision', 'predicted', average, warn_for)


The accuracy for random forest classifier is: 0.8888391524917935
Prediction Positive Number: 2745 True Number: 3488
Prediction Negative Number: 3957 True Number: 3214
              precision    recall  f1-score   support

           0       0.81      1.00      0.90      3214
           1       1.00      0.79      0.88      3488

    accuracy                           0.89      6702
   macro avg       0.91      0.89      0.89      6702
weighted avg       0.91      0.89      0.89      6702

1
1
1
1
length of train set 11316, test set 1240
train 1 11316, train 0 11316
The accuracy for logistic regression classifier is: 0.6104838709677419
Prediction Positive Number: 0 True Number: 483
Prediction Negative Number: 1240 True Number: 757
              precision    recall  f1-score   support

           0       0.61      1.00      0.76       757
           1       0.00      0.00      0.00       483

    accuracy                           0.61      1240
   macro avg       0.31      0.50      0.3

  'precision', 'predicted', average, warn_for)


The accuracy for random forest classifier is: 0.9112903225806451
Prediction Positive Number: 383 True Number: 483
Prediction Negative Number: 857 True Number: 757
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       757
           1       0.99      0.78      0.87       483

    accuracy                           0.91      1240
   macro avg       0.93      0.89      0.90      1240
weighted avg       0.92      0.91      0.91      1240



In [20]:
NUM_CLUSTERS = 2
x_train, y_train, x_test, y_test = upsample_pos(x, y, upsample=False)
train_group, test_group = train_kmeans(x_train, y_train, test=[x_test, y_test])
for i in range(NUM_CLUSTERS):
    cur_train_x, cur_train_y = x_train[train_group==i], y_train[train_group==i]
    cur_train_x, cur_train_y = balance_data(cur_train_x, cur_train_y, upsample=True, k_neighbors=500)
    
    cur_test_x, cur_test_y = x_test[test_group==i], y_test[test_group==i]
    
    b = np.bincount(cur_train_y)
    ii = np.nonzero(b)[0]
    print(np.vstack((ii,b[ii])).T)
    print('length of train set {}, test set {}'.format(len(cur_train_x), len(cur_test_x)))
    clf_lr, lr_acc = train_lr(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_rf, rf_acc = train_rand_forest(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])

19859
99295
0.4654998740871317
1
1
[[    0 28002]
 [    1 28002]]
length of train set 56004, test set 6702
train 1 56004, train 0 56004
The accuracy for logistic regression classifier is: 0.7555953446732319
Prediction Positive Number: 3544 True Number: 3488
Prediction Negative Number: 3158 True Number: 3214
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3214
           1       0.76      0.77      0.77      3488

    accuracy                           0.76      6702
   macro avg       0.76      0.75      0.76      6702
weighted avg       0.76      0.76      0.76      6702

The accuracy for random forest classifier is: 0.8976424947776783
Prediction Positive Number: 2918 True Number: 3488
Prediction Negative Number: 3784 True Number: 3214
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      3214
           1       0.98      0.82      0.89      3488

    accuracy                     