# Train models for binary classification
## White-box and Black-box models
White-Box:

- Logistic Regression (logreg): cb12_logreg.pikle
- Decision Tree (DT): cb12_dt.pikle
- Naive Bayes (NB): cb12_nb.pikle

Black-box:

- AdaBoost (Ada): cb12_ada.pikle
- Random Forest (RF): cb12_rf.pikle
- Linear Discriminant Analysis (LDA): cb12_lda.pikle
- Quadratic Discriminant Analysis (QDA): cb12_qda.pikle
- XGBoost (xgbt): cb12_xgbt.pikle (max_depth=2, n_estimators=2, num_parallel_tree=25)

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Load the training and test sets

In [3]:
# Read the four pre-computed NumPy arrays (train/test features and labels) from the
# baseline output directory. The files are loaded in the order they are listed.
X_train, Y_train, X_test, Y_test = (
    np.load(array_path)
    for array_path in (
        './output_baseline/X_train_159.npy',
        './output_baseline/Y_train_159.npy',
        './output_baseline/X_test_159.npy',
        './output_baseline/Y_test_159.npy',
    )
)

In [4]:
# Sanity check: every training row needs exactly one label. Fail loudly here rather
# than inside a model's fit() call if the two arrays ever get out of sync.
assert X_train.shape[0] == Y_train.shape[0], "X_train and Y_train have different row counts"
X_train.shape, Y_train.shape

((563889, 159), (563889,))

In [5]:
# Sanity checks: every test row needs exactly one label, and the test features must
# have the same number of columns as the training features the models are fitted on.
assert X_test.shape[0] == len(Y_test), "X_test and Y_test have different row counts"
assert X_test.shape[1] == X_train.shape[1], "X_test and X_train have different feature counts"
X_test.shape, len(Y_test)

((15736, 159), 15736)

In [6]:
def show_result(y_true, y_prediction):
    """Print a metrics table for a set of predictions, plus an "overall" row.

    The text returned by ``classification_report(..., digits=4)`` is split into
    whitespace-separated tokens and rearranged into a five-column DataFrame whose
    column names are ``class`` followed by the report's own header names. A final
    row labelled ``"overall"`` holds, in column order, ``precision_score``,
    ``recall_score``, ``f1_score`` and ``roc_auc_score`` of the same inputs. The
    ROC AUC value therefore appears in the last column, under the report's last
    header name.

    Args:
        y_true: Ground-truth labels.
        y_prediction: Predicted labels, aligned with ``y_true``.

    Returns:
        None. A banner line and the table are printed to stdout.
    """
    report_lines = classification_report(y_true, y_prediction, digits=4).splitlines()
    header = ['class'] + report_lines[0].split()

    table_rows = []
    for line in report_lines[1:]:
        tokens = line.split()
        if not tokens:
            # Blank separator line inside the report.
            continue
        if len(tokens) < 5:
            # Short row (e.g. "accuracy"): label, then the last two columns only.
            table_rows.append((tokens[0], '', '', tokens[1], tokens[2]))
        elif len(tokens) > 5:
            # Two-word label (e.g. "macro avg"): merge the first two tokens.
            table_rows.append(
                (tokens[0] + ' ' + tokens[1], tokens[2], tokens[3], tokens[4], tokens[5])
            )
        else:
            # Exactly five tokens: one label plus four values.
            table_rows.append(tuple(tokens))

    # Summary row computed directly from the labels rather than parsed from text.
    table_rows.append((
        "overall",
        precision_score(y_true, y_prediction),
        recall_score(y_true, y_prediction),
        f1_score(y_true, y_prediction),
        roc_auc_score(y_true, y_prediction),
    ))

    result = pd.DataFrame()
    for position in range(5):
        result[header[position]] = [entry[position] for entry in table_rows]

    print("——————Test——————")
    print(result)

## Logistic Regression

In [11]:
%%time
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, Y_train)
y_pred = logreg.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8748    0.9542    0.9128      7868
1             1    0.9497    0.8634    0.9045      7868
2      accuracy                        0.9088     15736
3     macro avg    0.9122    0.9088    0.9086     15736
4  weighted avg    0.9122    0.9088    0.9086     15736
5       overall  0.949671  0.863371  0.904467  0.908808
CPU times: user 5min 8s, sys: 29.8 s, total: 5min 38s
Wall time: 1min 28s


In [12]:
# Save the fitted logistic-regression model.
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
import pickle
model_name = './output_baseline/cb12_logreg.pikle'
with open(model_name, "wb") as model_file:
    pickle.dump(logreg, model_file)

## Naive Bayes

In [13]:
%%time
nb = GaussianNB()
nb.fit(X_train, Y_train)
y_pred = nb.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8754    0.9542    0.9131      7868
1             1    0.9497    0.8641    0.9049      7868
2      accuracy                        0.9092     15736
3     macro avg    0.9125    0.9092    0.9090     15736
4  weighted avg    0.9125    0.9092    0.9090     15736
5       overall  0.949714  0.864133  0.904905  0.909189
CPU times: user 1.27 s, sys: 728 ms, total: 1.99 s
Wall time: 2.1 s


In [14]:
# Save the fitted naive Bayes model.
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
import pickle
model_name = './output_baseline/cb12_nb.pikle'
with open(model_name, "wb") as model_file:
    pickle.dump(nb, model_file)

## Decision Tree

In [15]:
# Fit a decision tree on the training split and report its test metrics.
# random_state is pinned (same seed as the AdaBoost cell) so that re-running the
# notebook reproduces the same tree and the same scores.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, Y_train)
y_pred = dt.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8419    0.8454    0.8437      7868
1             1    0.8448    0.8413    0.8430      7868
2      accuracy                        0.8434     15736
3     macro avg    0.8434    0.8434    0.8434     15736
4  weighted avg    0.8434    0.8434    0.8434     15736
5       overall  0.844799  0.841256  0.843024  0.843353


In [16]:
# Save the fitted decision-tree model.
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
import pickle
model_name = './output_baseline/cb12_dt.pikle'
with open(model_name, "wb") as model_file:
    pickle.dump(dt, model_file)

## Random Forest

In [17]:
%%time
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8759    0.9487    0.9108      7868
1             1    0.9440    0.8655    0.9031      7868
2      accuracy                        0.9071     15736
3     macro avg    0.9099    0.9071    0.9069     15736
4  weighted avg    0.9099    0.9071    0.9069     15736
5       overall  0.943998  0.865531  0.903063  0.907092
CPU times: user 11min 7s, sys: 3.95 s, total: 11min 11s
Wall time: 11min 13s


In [18]:
# Save the fitted random-forest model.
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
import pickle
model_name = './output_baseline/cb12_rf.pikle'
with open(model_name, "wb") as model_file:
    pickle.dump(rf, model_file)

## AdaBoost

In [10]:
%%time
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(X_train, Y_train)
y_pred = ada.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8750    0.9542    0.9129      7868
1             1    0.9497    0.8636    0.9046      7868
2      accuracy                        0.9089     15736
3     macro avg    0.9123    0.9089    0.9087     15736
4  weighted avg    0.9123    0.9089    0.9087     15736
5       overall  0.949686  0.863625  0.904613  0.908935
CPU times: user 8min 50s, sys: 27.1 s, total: 9min 17s
Wall time: 9min 21s


In [11]:
# Save the fitted AdaBoost model.
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
import pickle
model_name = './output_baseline/cb12_ada.pikle'
with open(model_name, "wb") as model_file:
    pickle.dump(ada, model_file)

## Linear Discriminant Analysis
https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html

In [17]:
%%time
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, Y_train)
y_pred = lda.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8747    0.9544    0.9128      7868
1             1    0.9498    0.8632    0.9045      7868
2      accuracy                        0.9088     15736
3     macro avg    0.9122    0.9088    0.9086     15736
4  weighted avg    0.9122    0.9088    0.9086     15736
5       overall  0.949797  0.863244  0.904454  0.908808
CPU times: user 21.7 s, sys: 3.17 s, total: 24.8 s
Wall time: 15.8 s


In [18]:
# Save the fitted LDA model.
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
import pickle
model_name = './output_baseline/cb12_lda.pikle'
with open(model_name, "wb") as model_file:
    pickle.dump(lda, model_file)

## Quadratic Discriminant Analysis
https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis.html

In [19]:
%%time
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, Y_train)
y_pred = qda.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision   recall  f1-score   support
0             0    0.8751   0.9512    0.9116      7868
1             1    0.9465   0.8643    0.9035      7868
2      accuracy                       0.9077     15736
3     macro avg    0.9108   0.9077    0.9076     15736
4  weighted avg    0.9108   0.9077    0.9076     15736
5       overall  0.946548  0.86426  0.903534  0.907728
CPU times: user 38.2 s, sys: 3.09 s, total: 41.3 s
Wall time: 14.4 s


In [20]:
# Save the fitted QDA model.
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
import pickle
model_name = './output_baseline/cb12_qda.pikle'
with open(model_name, "wb") as model_file:
    pickle.dump(qda, model_file)

## XGBoost RecSys 2017 - Modify objective to binary

In [7]:
import xgboost as xgb
from xgboost import XGBClassifier

# Binary-logistic XGBoost classifier: n_estimators=2, each with num_parallel_tree=25
# trees of max_depth=2, evaluated with log-loss and run on 4 threads.
# NOTE(review): subsample / colsample_* are left at their defaults here; per the XGBoost
# random-forest notes the parallel trees may then be identical — confirm this is intended.
xgbt = XGBClassifier(
    n_estimators=2,
    max_depth=2,
    num_parallel_tree=25,
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='logloss',
    # base_score = 0.0
    nthread=4,
)

In [8]:
%%time
xgbt.fit(X_train, Y_train)
y_pred = xgbt.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8747    0.9544    0.9128      7868
1             1    0.9498    0.8632    0.9045      7868
2      accuracy                        0.9088     15736
3     macro avg    0.9122    0.9088    0.9086     15736
4  weighted avg    0.9122    0.9088    0.9086     15736
5       overall  0.949797  0.863244  0.904454  0.908808
CPU times: user 3min 4s, sys: 2.89 s, total: 3min 7s
Wall time: 48.7 s


In [9]:
# Save the fitted XGBoost model.
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
import pickle
model_name = './output_baseline/cb12_xgbt.pikle'
with open(model_name, "wb") as model_file:
    pickle.dump(xgbt, model_file)

In [None]:
# Save the SVM model.
# NOTE(review): no visible cell in this notebook trains or defines `svm`; confirm where
# it is supposed to come from, or remove this cell.
import pickle
model_name = './output_baseline/cb12_svm.pikle'
if 'svm' not in globals():
    # Fail before the output file is opened so an existing pickle is never truncated.
    raise NameError("name 'svm' is not defined: train an SVM model before running this cell")
# The with-block closes the file even if pickling fails; pickle.dump returns None,
# so its result is not kept.
with open(model_name, "wb") as model_file:
    pickle.dump(svm, model_file)