In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.cluster import KMeans

from util import load_pickle_file
from util import save_pickle_file
from util import report_test
from util import upsample_pos
from util import data_preprocessing
from util import rand_train_test

from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn import ensemble

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
Using TensorFlow backend.


In [2]:
def plot_roc_curve(clf, test, clf_name):
    x_test, y_test = test
    
    fpr = list()
    tpr = list()
    aucs = list()
    for i in range(len(clf)):
        fpr, tpr, _ = roc_curve(y_test, clf[i].predict_proba(x_test)[:,1])
        roc_auc = auc(fpr, tpr)
    
        lw = 2
        plt.plot(fpr, tpr, label='ROC curve for ' + clf_name[i] + ' (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc=0)
    plt.show()

In [3]:
def train_gboost(x, y, test=None):
    clf = ensemble.GradientBoostingClassifier(n_estimators=1000, max_leaf_nodes=4, max_depth=None, random_state=2, min_samples_split=5)
    clf.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf, test, "gradient boosting")
        return clf, clf_acc
    return clf, test, "gradient boosting"

def train_kmeans(x, y, test=None, k=2):
    kmeans = KMeans(n_clusters=k, random_state=229).fit(x)
    
    if test is not None:
        x_test, y_test = test
        # clf_acc = report_test(kmeans, test, "kmeans")
        # print(kmeans.cluster_centers_)
        y_pred = kmeans.predict(x_test)
        print((y_pred == y_test).sum()/len(y_test))
        return kmeans.labels_, y_pred
    return kmeans, test, 'kmeans'

def train_svm(x, y, kernel_type, test=None):
    clf_svm = SVC(kernel='linear', probability=True)
    if kernel_type == 'poly':
        clf_svm = SVC(kernel='poly', degree=8, probability=True)
    elif kernel_type == 'rbf':
        clf_svm = SVC(kernel='rbf', probability=True)
    elif kernel_type == 'sigmoid':
        clf_svm = SVC(kernel='sigmoid', probability=True)
    clf_svm.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_svm, test, "svm")
        return clf_svm, test, 'svm'
    return clf_svm, test, 'svm'

def train_lr(x, y, rand_state=44, solver='liblinear',
        max_iter=10000, test=None):
    clf_lr = LogisticRegression(
        random_state=rand_state, solver=solver, max_iter=max_iter, C=0.0001)
    # clf_lr = LogisticRegression(C = 0.0001)
    clf_lr.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_lr, test, "logistic regression")
        return clf_lr, clf_acc
    return clf_lr, test, 'logistic regression'

def train_rand_forest(x, y, n_est=100, max_depth=3, rand_state=44, test=None):
    # clf_rf = RandomForestClassifier(n_estimators=n_est, max_depth=max_depth,
    #     random_state=rand_state)
    clf_rf = RandomForestClassifier(n_estimators = 100, random_state = 50, n_jobs = -1)
    clf_rf.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_rf, test, "random forest")
        return clf_rf, clf_acc
    return clf_rf, test, "random forest"

def train_nb(x, y, test=None):
    clf_nb = GaussianNB().fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_nb, test, "Gaussian Naive Bayes")
        return clf_nb, clf_acc
    return clf_nb, test, "Gaussian Naive Bayes"

def train_mlp(x, y, solver='lbfgs', alpha=1e-4, hls=(10, 40, 40),
        rand_state=229, test=None):
    clf_nn = MLPClassifier(
        solver=solver, alpha=alpha, hidden_layer_sizes=hls,
        random_state=rand_state)
    clf_nn.fit(x, y)
    if test is not None:
        clf_acc = report_test(clf_nn, test, "neural network")
        return clf_nn, clf_acc
    return clf_nn

def train_lgbm(x, y, test=None):
    clf_lgbm = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1, )
    clf_lgbm.fit(x, y, verbose=100)

    if test is not None:
        clf_acc = report_test(clf_lgbm, test, "LGBM")
        return clf_lgbm, clf_acc
    return clf_lgbm, test, "LGBM"

In [4]:
training_data_path = './data_processed/training_data.pkl'
label_path = './data_processed/training_lbl.pkl'
# training_data_path = './data_processed/training_data_processed.pkl'
# label_path = './data_processed/training_lbl_processed.pkl'
# training_data_path = './data_processed/training_data.pkl'
# label_path = './data_processed/training_lbl.pkl'
data = load_pickle_file(training_data_path)
label = load_pickle_file(label_path)
print('Training data has been successfully loaded')

Training data has been successfully loaded


In [5]:
y = np.array(label)
x = data
# entries = list(data.columns)
x = np.array(x)
print(x.shape)
# raise
x, y = data_preprocessing(x, y, thres=0.0)

(307511, 613)
(307511, 613)
(307511,)


In [6]:
lr_acc_ls = []
rf_acc_ls = []
nb_acc_ls = []
nn_acc_ls = []
lgbm_acc_ls = []
# kf = KFold(n_splits=1, shuffle=True)
print('Training is starting ... ')
print('shape of x: {}'.format(x.shape))

Training is starting ... 
shape of x: (307511, 613)


In [6]:
# x, y, x_test, y_test = upsample_pos(x, y, upsample=True)
# x, y, x_test, y_test = rand_train_test(x, y)
# save_pickle_file(x, "training_data_up.pkl")
# save_pickle_file(y, "training_lbl_up.pkl")
# save_pickle_file(x_test, "testing_data_up.pkl")
# save_pickle_file(y_test, "testing_lbl_up.pkl")
# x = load_pickle_file('training_data_up.pkl')
# y = load_pickle_file('training_lbl_up.pkl')
# x_test = load_pickle_file('testing_data_up.pkl')
# y_test = load_pickle_file('testing_lbl_up.pkl')
# raise
# print('Percentage of zeros in trainset input: {}'.format(np.count_nonzero(x==0)/x.size))
# print('Number of positive examples: {}, negative: {}'.format((y==1).sum(), (y==0).sum()))
# # for train, test in kf.split(x):
# print("here")
# x_train, x_test, y_train, y_test = x, x_test, y, y_test
# print(x_train.shape)
# print(x_test.shape)
# print(len(y_test==1))
# print(len(y_test==0))

In [7]:
def balance_data(x, y, upsample=False, k_neighbors=1000):
    # less positive, more negative
    all_pos = np.where(y == 1)
    print(len(all_pos))
    x_all_pos = x[all_pos[0]]
    y_all_pos = y[all_pos[0]]

    all_neg = np.where(y == 0)
    print(len(all_neg))
    x_all_neg = x[all_neg[0]]
    y_all_neg = y[all_neg[0]]

    if upsample:
        rand_ind = np.arange(len(x_all_neg))
        np.random.shuffle(rand_ind)
        x_neg_new = x_all_neg[rand_ind[:2*len(x_all_pos)]]
        y_neg_new = y_all_neg[rand_ind[:2*len(x_all_pos)]]
        x_all_new = np.concatenate((x_neg_new, x_all_pos), axis=0)
        y_all_new = np.concatenate((y_neg_new, y_all_pos), axis=0)
        sm = SMOTE(random_state=233333, sampling_strategy=1.0, k_neighbors=k_neighbors)
        x_train, y_train = sm.fit_sample(x_all_new, y_all_new)
    else:
        # undersample: balance train set
#         x_all_neg = x_all_neg[:int(5*len(x_all_pos))]
#         y_all_neg = y_all_neg[:int(5*len(x_all_pos))]
        x_all_neg = x_all_neg[:len(x_all_pos)]
        y_all_neg = y_all_neg[:len(x_all_pos)]
        x_train = np.concatenate((x_all_neg, x_all_pos), axis=0)
        y_train = np.concatenate((y_all_neg, y_all_pos), axis=0)
    
    rand_shuffle = np.arange(len(x_train))
    np.random.shuffle(rand_shuffle)
    x_train = x_train[rand_shuffle]
    y_train = y_train[rand_shuffle]
    return x_train, y_train

In [8]:
def count_values(x):
    b = np.bincount(x)
    ii = np.nonzero(b)[0]
    print(np.vstack((ii,b[ii])).T)

In [None]:
NUM_CLUSTERS = 2
x_train, y_train, x_test, y_test = upsample_pos(x, y, upsample=False)
train_group, test_group = train_kmeans(x_train, y_train, test=[x_test, y_test], k=NUM_CLUSTERS)
for i in range(NUM_CLUSTERS):
    cur_train_x, cur_train_y = x_train[train_group==i], y_train[train_group==i]
    count_values(cur_train_y)
    cur_train_x, cur_train_y = balance_data(cur_train_x, cur_train_y, upsample=False, k_neighbors=500)
    
    cur_test_x, cur_test_y = x_test[test_group==i], y_test[test_group==i]
    
    count_values(cur_train_y)
    print('length of train set {}, test set {}'.format(len(cur_train_x), len(cur_test_x)))
    clf_lr, lr_acc = train_lr(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf_rf, rf_acc = train_rand_forest(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])
    clf = train_gboost(cur_train_x, cur_train_y, test=[cur_test_x, cur_test_y])

24825
282686
0.4689828801611279
[[    0 16207]
 [    1 17489]]
1
1
[[    0 16207]
 [    1 17489]]
length of train set 33696, test set 8382
The accuracy for logistic regression classifier is: 0.6160820806490098
Prediction Positive Number: 5975 True Number: 4345
Prediction Negative Number: 2407 True Number: 4037
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      4037
           1       0.52      1.00      0.68      4345

    accuracy                           0.52      8382
   macro avg       0.76      0.50      0.34      8382
weighted avg       0.75      0.52      0.35      8382

The accuracy for random forest classifier is: 0.650560725363875
Prediction Positive Number: 4498 True Number: 4345
Prediction Negative Number: 3884 True Number: 4037
              precision    recall  f1-score   support

           0       0.73      0.31      0.43      4037
           1       0.58      0.89      0.70      4345

    accuracy                   

In [None]:
def train_kmeans_x(x, k=2):
    kmeans = KMeans(n_clusters=k, random_state=44).fit(x)
    return kmeans.labels_

In [13]:
NUM_CLUSTERS = 2

groups = train_kmeans_x(x, k=NUM_CLUSTERS)
for i in range(NUM_CLUSTERS):
    cur_x, cur_y = x[groups==i], y[groups==i]
    count_values(cur_y)
    x_train, y_train, x_test, y_test = upsample(cur_x, cur_y, upsample=True)
    count_values(y_train)
    train_lr(x_train, y_train, test=[x_test, y_test])
    train_rand_forest(x_train, y_train, test=[x_test, y_test])

NameError: name 'train_kmeans_x' is not defined

In [13]:
def upsample(x, y, upsample=True):
    # less positive, more negative
    all_pos = np.where(y == 1)
    print(len(all_pos[0]))
    x_all_pos = x[all_pos[0]]
    y_all_pos = y[all_pos[0]]
    cut_len = len(x_all_pos) // 5
    x_test = x_all_pos[:cut_len]
    y_test = y_all_pos[:cut_len]
    x_all_pos = x_all_pos[cut_len + 1:]
    y_all_pos = y_all_pos[cut_len + 1:]

    all_neg = np.where(y == 0)
    print(len(all_neg[0]))
    x_all_neg = x[all_neg[0]]
    y_all_neg = y[all_neg[0]]
    x_test = np.concatenate((x_test, x_all_neg[:cut_len]), axis=0)
    y_test = np.concatenate((y_test, y_all_neg[:cut_len]), axis=0)
    x_all_neg = x_all_neg[cut_len + 1:]
    y_all_neg = y_all_neg[cut_len + 1:]

    if upsample:
        rand_ind = np.arange(len(x_all_neg))
        np.random.shuffle(rand_ind)
        x_neg_new = x_all_neg[rand_ind[:2*len(x_all_pos)]]
        y_neg_new = y_all_neg[rand_ind[:2*len(x_all_pos)]]
        x_all_new = np.concatenate((x_neg_new, x_all_pos), axis=0)
        y_all_new = np.concatenate((y_neg_new, y_all_pos), axis=0)
        sm = SMOTE(random_state=233333, sampling_strategy=1.0, k_neighbors=1000)
        x_train, y_train = sm.fit_sample(x_all_new, y_all_new)
    else:
        # undersample: balance train set
        x_all_neg = x_all_neg[:int(len(x_all_pos))]
        y_all_neg = y_all_neg[:int(len(x_all_pos))]
        x_train = np.concatenate((x_all_neg, x_all_pos), axis=0)
        y_train = np.concatenate((y_all_neg, y_all_pos), axis=0)
    
    rand_shuffle = np.arange(len(x_train))
    np.random.shuffle(rand_shuffle)
    x_train = x_train[rand_shuffle]
    y_train = y_train[rand_shuffle]
    
    rand_shuffle_test = np.arange(len(x_test))
    np.random.shuffle(rand_shuffle_test)
    x_test = x_test[rand_shuffle_test]
    y_test = y_test[rand_shuffle_test]
    return x_train, y_train, x_test, y_test