In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.cluster import KMeans

from util import load_pickle_file
from util import save_pickle_file
from util import report_test
from util import upsample_pos
from util import data_preprocessing
from util import rand_train_test

from imblearn.over_sampling import SMOTE

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def train_kmeans(x, y, n_clu=2, test=None):
    kmeans = KMeans(n_clusters=n_clu, random_state=229).fit(x)
    
    if test is not None:
        x_test, y_test = test
        # clf_acc = report_test(kmeans, test, "kmeans")
        # print(kmeans.cluster_centers_)
        y_pred = kmeans.predict(x_test)
        print((y_pred == y_test).sum()/len(y_test))
        return kmeans.labels_, y_pred
    return kmeans, test, 'kmeans'

def train_svm(x, y, kernel_type, test=None):
    clf_svm = SVC(kernel='linear', probability=True)
    if kernel_type == 'poly':
        clf_svm = SVC(kernel='poly', degree=8, probability=True)
    elif kernel_type == 'rbf':
        clf_svm = SVC(kernel='rbf', probability=True)
    elif kernel_type == 'sigmoid':
        clf_svm = SVC(kernel='sigmoid', probability=True)
    clf_svm.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_svm, test, "svm")
        return clf_svm, test, 'svm'
    return clf_svm, test, 'svm'

def train_lr(x, y, rand_state=229, solver='liblinear',
        max_iter=10000, test=None):
    clf_lr = LogisticRegression(
        random_state=rand_state, solver=solver, max_iter=max_iter, C=0.0001, class_weight="balanced")
    # clf_lr = LogisticRegression(C = 0.0001)
    clf_lr.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_lr, test, "logistic regression")
        return clf_lr, clf_acc
    return clf_lr

def train_rand_forest(x, y, n_est=100, max_depth=3, rand_state=229, test=None):
    # clf_rf = RandomForestClassifier(n_estimators=n_est, max_depth=max_depth,
    #     random_state=rand_state)
    clf_rf = RandomForestClassifier(n_estimators = 100, random_state = 50, n_jobs = -1, class_weight="balanced")
    clf_rf.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_rf, test, "random forest")
        return clf_rf, clf_acc
    return clf_rf

def train_nb(x, y, test=None):
    clf_nb = GaussianNB().fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_nb, test, "Gaussian Naive Bayes")
        return clf_nb, clf_acc
    return clf_nb

def train_mlp(x, y, solver='lbfgs', alpha=1e-4, hls=(10, 40, 40),
        rand_state=229, test=None):
    clf_nn = MLPClassifier(
        solver=solver, alpha=alpha, hidden_layer_sizes=hls,
        random_state=rand_state)
    clf_nn.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_nn, test, "neural network")
        return clf_nn, clf_acc
    return clf_nn

def train_lgbm(x, y, test=None):
    clf_lgbm = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1, )
    clf_lgbm.fit(x, y, verbose=100)

    if test is not None:
        clf_acc = report_test(clf_lgbm, test, "LGBM")
        return clf_lgbm, clf_acc
    return clf_lgbm

In [3]:
training_data_path = './data_processed/training_data.pkl'
label_path = './data_processed/training_lbl.pkl'
# training_data_path = './data_processed/training_data_processed.pkl'
# label_path = './data_processed/training_lbl_processed.pkl'
# training_data_path = './data_processed/training_data.pkl'
# label_path = './data_processed/training_lbl.pkl'
data = load_pickle_file(training_data_path)
label = load_pickle_file(label_path)
print('Training data has been successfully loaded')

Training data has been successfully loaded


In [4]:
y = np.array(label)
x = data
# entries = list(data.columns)
x = np.array(x)
print(x.shape)
# raise
x, y = data_preprocessing(x, y, thres=0.1)

(307511, 651)
(307511, 651)
(307511,)


In [5]:
lr_acc_ls = []
rf_acc_ls = []
nb_acc_ls = []
nn_acc_ls = []
lgbm_acc_ls = []
# kf = KFold(n_splits=1, shuffle=True)
print('Training is starting ... ')
print('shape of x: {}'.format(x.shape))

Training is starting ... 
shape of x: (307511, 651)


In [6]:
# x, y, x_test, y_test = upsample_pos(x, y, upsample=True)
# x, y, x_test, y_test = rand_train_test(x, y)
# save_pickle_file(x, "training_data_up.pkl")
# save_pickle_file(y, "training_lbl_up.pkl")
# save_pickle_file(x_test, "testing_data_up.pkl")
# save_pickle_file(y_test, "testing_lbl_up.pkl")
# x = load_pickle_file('training_data_up.pkl')
# y = load_pickle_file('training_lbl_up.pkl')
# x_test = load_pickle_file('testing_data_up.pkl')
# y_test = load_pickle_file('testing_lbl_up.pkl')
# raise
# print('Percentage of zeros in trainset input: {}'.format(np.count_nonzero(x==0)/x.size))
# print('Number of positive examples: {}, negative: {}'.format((y==1).sum(), (y==0).sum()))
# # for train, test in kf.split(x):
# print("here")
# x_train, x_test, y_train, y_test = x, x_test, y, y_test
# print(x_train.shape)
# print(x_test.shape)
# print(len(y_test==1))
# print(len(y_test==0))

In [7]:
def balance_data(x, y, upsample=False, k_neighbors=1000):
    # less positive, more negative
    all_pos = np.where(y == 1)
    print(len(all_pos))
    x_all_pos = x[all_pos[0]]
    y_all_pos = y[all_pos[0]]

    all_neg = np.where(y == 0)
    print(len(all_neg))
    x_all_neg = x[all_neg[0]]
    y_all_neg = y[all_neg[0]]

    if upsample:
        rand_ind = np.arange(len(x_all_neg))
        np.random.shuffle(rand_ind)
        x_neg_new = x_all_neg[rand_ind[:5*len(x_all_pos)]]
        y_neg_new = y_all_neg[rand_ind[:5*len(x_all_pos)]]
        x_all_new = np.concatenate((x_neg_new, x_all_pos), axis=0)
        y_all_new = np.concatenate((y_neg_new, y_all_pos), axis=0)
        sm = SMOTE(random_state=233333, sampling_strategy=1.0, k_neighbors=k_neighbors)
        x_train, y_train = sm.fit_sample(x_all_new, y_all_new)
    else:
        # undersample: balance train set
#         x_all_neg = x_all_neg[:int(5*len(x_all_pos))]
#         y_all_neg = y_all_neg[:int(5*len(x_all_pos))]
        x_all_neg = x_all_neg[:5 *len(x_all_pos)]
        y_all_neg = y_all_neg[:5 *len(x_all_pos)]
        x_train = np.concatenate((x_all_neg, x_all_pos), axis=0)
        y_train = np.concatenate((y_all_neg, y_all_pos), axis=0)
    
    rand_shuffle = np.arange(len(x_train))
    np.random.shuffle(rand_shuffle)
    x_train = x_train[rand_shuffle]
    y_train = y_train[rand_shuffle]
    return x_train, y_train

In [8]:
def count_values(x):
    b = np.bincount(x)
    ii = np.nonzero(b)[0]
    print(np.vstack((ii,b[ii])).T)

In [11]:
NUM_CLUSTERS = 4
x_train, y_train, x_test, y_test = upsample_pos(x, y, upsample=False)
train_group, test_group = train_kmeans(x_train, y_train, test=[x_test, y_test])
for i in range(NUM_CLUSTERS):
    cur_train_x, cur_train_y = x_train[train_group==i], y_train[train_group==i]
    cur_train_x, cur_train_y = balance_data(cur_train_x, cur_train_y, upsample=False, k_neighbors=1000)
    
    cur_test_x, cur_test_y = x_test[test_group==i], y_test[test_group==i]
    cur_test_x, cur_test_y = balance_data(cur_test_x, cur_test_y, upsample=False, k_neighbors=1000)
    
    print('length of train set {}, test set {}'.format(len(cur_train_x), len(cur_test_x)))
    print('train 1 {}, train 0 {}'.format(len(cur_train_y==1), len(cur_train_y==0)))
    clf_lr, lr_acc = train_lr(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_rf, rf_acc = train_rand_forest(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_nb, nb_acc = train_nb(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_mlp, mlp_acc = train_mlp(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_lgbm, lgbm_acc = train_lgbm(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])

24825
282686
0.5313192346424975
1
1
1
1
length of train set 4744, test set 1240
train 1 4744, train 0 4744
The accuracy for logistic regression classifier is: 0.6645161290322581
Prediction Positive Number: 510 True Number: 620
Prediction Negative Number: 730 True Number: 620
              precision    recall  f1-score   support

           0       0.64      0.75      0.69       620
           1       0.70      0.58      0.63       620

    accuracy                           0.66      1240
   macro avg       0.67      0.66      0.66      1240
weighted avg       0.67      0.66      0.66      1240

The accuracy for random forest classifier is: 0.6612903225806451
Prediction Positive Number: 218 True Number: 620
Prediction Negative Number: 1022 True Number: 620
              precision    recall  f1-score   support

           0       0.60      0.99      0.74       620
           1       0.96      0.34      0.50       620

    accuracy                           0.66      1240
   macro avg   

  'precision', 'predicted', average, warn_for)


1
1
1
length of train set 33680, test set 8379
train 1 33680, train 0 33680
The accuracy for logistic regression classifier is: 0.6986513903807137
Prediction Positive Number: 4030 True Number: 4345
Prediction Negative Number: 4349 True Number: 4034
              precision    recall  f1-score   support

           0       0.67      0.73      0.70      4034
           1       0.73      0.67      0.70      4345

    accuracy                           0.70      8379
   macro avg       0.70      0.70      0.70      8379
weighted avg       0.70      0.70      0.70      8379

The accuracy for random forest classifier is: 0.6231053825038787
Prediction Positive Number: 1193 True Number: 4345
Prediction Negative Number: 7186 True Number: 4034
              precision    recall  f1-score   support

           0       0.56      1.00      0.72      4034
           1       1.00      0.27      0.43      4345

    accuracy                           0.62      8379
   macro avg       0.78      0.64      

In [15]:
NUM_CLUSTERS = 4
x_train, y_train, x_test, y_test = upsample_pos(x, y, upsample=False)
train_group, test_group = train_kmeans(x_train, y_train, n_clu=NUM_CLUSTERS, test=[x_test, y_test])
for i in range(NUM_CLUSTERS):
    cur_train_x, cur_train_y = x_train[train_group==i], y_train[train_group==i]
    count_values(cur_train_y)
    cur_train_x, cur_train_y = balance_data(cur_train_x, cur_train_y, upsample=False, k_neighbors=500)
    
    cur_test_x, cur_test_y = x_test[test_group==i], y_test[test_group==i]
    
    count_values(cur_train_y)
    print('length of train set {}, test set {}'.format(len(cur_train_x), len(cur_test_x)))
    clf_lr, lr_acc = train_lr(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_rf, rf_acc = train_rand_forest(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_nb, nb_acc = train_nb(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_mlp, mlp_acc = train_mlp(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_lgbm, lgbm_acc = train_lgbm(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])

24825
282686
0.26163141993957706
[[   0 3654]
 [   1 2368]]
1
1
[[   0 3654]
 [   1 2368]]
length of train set 6022, test set 1546
The accuracy for logistic regression classifier is: 0.6824062095730918
Prediction Positive Number: 574 True Number: 617
Prediction Negative Number: 972 True Number: 929
              precision    recall  f1-score   support

           0       0.73      0.76      0.74       929
           1       0.61      0.57      0.59       617

    accuracy                           0.68      1546
   macro avg       0.67      0.66      0.66      1546
weighted avg       0.68      0.68      0.68      1546

The accuracy for random forest classifier is: 0.6940491591203105
Prediction Positive Number: 148 True Number: 617
Prediction Negative Number: 1398 True Number: 929
              precision    recall  f1-score   support

           0       0.66      1.00      0.80       929
           1       0.99      0.24      0.38       617

    accuracy                           0.69  

The accuracy for random forest classifier is: 0.5687943262411348
Prediction Positive Number: 436 True Number: 1314
Prediction Negative Number: 1679 True Number: 801
              precision    recall  f1-score   support

           0       0.47      0.98      0.63       801
           1       0.96      0.32      0.48      1314

    accuracy                           0.57      2115
   macro avg       0.71      0.65      0.56      2115
weighted avg       0.77      0.57      0.54      2115

The accuracy for Gaussian Naive Bayes classifier is: 0.4052009456264775
Prediction Positive Number: 138 True Number: 1314
Prediction Negative Number: 1977 True Number: 801
              precision    recall  f1-score   support

           0       0.38      0.95      0.55       801
           1       0.70      0.07      0.13      1314

    accuracy                           0.41      2115
   macro avg       0.54      0.51      0.34      2115
weighted avg       0.58      0.41      0.29      2115

The accur

In [15]:
import torch
from torch import nn
from torch.autograd import Variable
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from mlp import MLP
from util import load_pickle_file
from util import upsample_pos
num_epochs = 80
bs = 100
learning_rate = 1e-4
net = MLP()
def train_nn(x, y, x_test, y_test):
    num_bs = len(x) // bs
    criterion = nn.CrossEntropyLoss(weight=torch.tensor(np.array([1., 1.])).float())  
    optimizer = torch.optim.AdamW(net.parameters(), lr=learning_rate, weight_decay=1e-4)
    for epoch in range(num_epochs):
        for ii in range(num_bs - 1):  
            # Convert torch tensor to Variable
            curr_data = x[ii * bs: (ii + 1) * bs]
            curr_labels = y[ii * bs: (ii + 1) * bs]
            # print(curr_data.shape)
            # print(type(curr_data.view(-1, 651)))
            curr_data = Variable(torch.tensor(curr_data).float())
            curr_labels = Variable(torch.tensor(curr_labels).long())

            # Forward + Backward + Optimize
            optimizer.zero_grad()  # zero the gradient buffer
            outputs = net(curr_data)
            loss = criterion(outputs, curr_labels)
            loss.backward()
            optimizer.step()

            if (ii+1) % 100 == 0:
                _, predicted = torch.max(outputs.data, 1)
                print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' 
                       %(epoch+1, num_epochs, ii+1, len(x)//bs, loss.data))
                print("train acc:" + str(1 - len(torch.nonzero(predicted - curr_labels)) * 1.0 / len(predicted)))
                test_data = Variable(torch.tensor(x_test).float())
                test_labels = Variable(torch.tensor(y_test).long())
                outputs = net(test_data)
                _, predicted = torch.max(outputs.data, 1)
                print("test acc:" + str(1 - len(torch.nonzero(predicted - test_labels)) * 1.0 / len(test_labels)))
    test_data = Variable(torch.tensor(x_test).float())
    test_labels = Variable(torch.tensor(y_test).long())
    outputs = net(test_data)
    _, predicted = torch.max(outputs.data, 1)
    print("f1: " + str(f1_score(test_labels, predicted > 0.5, average=None)))
    print("precision: " + str(precision_score(test_labels, predicted > 0.5, average=None)))
    print("recall: " + str(recall_score(test_labels, predicted > 0.5, average=None)))


In [16]:
NUM_CLUSTERS = 4
x_train, y_train, x_test, y_test = upsample_pos(x, y, upsample=False)
train_group, test_group = train_kmeans(x_train, y_train, n_clu=NUM_CLUSTERS, test=[x_test, y_test])
for i in range(NUM_CLUSTERS):
    cur_train_x, cur_train_y = x_train[train_group==i], y_train[train_group==i]
    count_values(cur_train_y)
    cur_train_x, cur_train_y = balance_data(cur_train_x, cur_train_y, upsample=False, k_neighbors=500)
    
    cur_test_x, cur_test_y = x_test[test_group==i], y_test[test_group==i]
    
    count_values(cur_train_y)
    print('length of train set {}, test set {}'.format(len(cur_train_x), len(cur_test_x)))
    train_nn(cur_train_x, cur_train_y, cur_test_x, cur_test_y)
#     clf_lr, lr_acc = train_lr(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
#     clf_rf, rf_acc = train_rand_forest(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
#     clf_nb, nb_acc = train_nb(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
#     clf_mlp, mlp_acc = train_mlp(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
#     clf_lgbm, lgbm_acc = train_lgbm(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])

24825
282686
0.3323262839879154
[[   0 7946]
 [   1 6854]]
1
1
[[   0 7946]
 [   1 6854]]
length of train set 14800, test set 3754
Epoch [1/80], Step [100/148], Loss: 0.7410
train acc:0.41000000000000003
test acc:0.4608417687799681
Epoch [2/80], Step [100/148], Loss: 0.7015
train acc:0.41000000000000003
test acc:0.4608417687799681
Epoch [3/80], Step [100/148], Loss: 0.6896
train acc:0.5900000000000001
test acc:0.539158231220032
Epoch [4/80], Step [100/148], Loss: 0.6855
train acc:0.5900000000000001
test acc:0.539158231220032
Epoch [5/80], Step [100/148], Loss: 0.6838
train acc:0.5900000000000001
test acc:0.539158231220032
Epoch [6/80], Step [100/148], Loss: 0.6830
train acc:0.5900000000000001
test acc:0.539158231220032
Epoch [7/80], Step [100/148], Loss: 0.6824
train acc:0.5900000000000001
test acc:0.539158231220032
Epoch [8/80], Step [100/148], Loss: 0.6806
train acc:0.5900000000000001
test acc:0.539158231220032
Epoch [9/80], Step [100/148], Loss: 0.6761
train acc:0.5900000000000001
t

Epoch [6/80], Step [100/103], Loss: 0.5932
train acc:0.7
test acc:0.6728323699421965
Epoch [7/80], Step [100/103], Loss: 0.5931
train acc:0.7
test acc:0.6751445086705202
Epoch [8/80], Step [100/103], Loss: 0.5930
train acc:0.7
test acc:0.6759152215799615
Epoch [9/80], Step [100/103], Loss: 0.5930
train acc:0.69
test acc:0.6770712909441233
Epoch [10/80], Step [100/103], Loss: 0.5929
train acc:0.69
test acc:0.6751445086705202
Epoch [11/80], Step [100/103], Loss: 0.5929
train acc:0.7
test acc:0.6743737957610789
Epoch [12/80], Step [100/103], Loss: 0.5929
train acc:0.7
test acc:0.6743737957610789
Epoch [13/80], Step [100/103], Loss: 0.5929
train acc:0.7
test acc:0.6751445086705202
Epoch [14/80], Step [100/103], Loss: 0.5929
train acc:0.7
test acc:0.6751445086705202
Epoch [15/80], Step [100/103], Loss: 0.5930
train acc:0.7
test acc:0.6755298651252408
Epoch [16/80], Step [100/103], Loss: 0.5930
train acc:0.7
test acc:0.6774566473988439
Epoch [17/80], Step [100/103], Loss: 0.5930
train acc:0.