In [1]:
import os
import csv
import ember
import numpy as np
import pandas as pd
import altair as alt
import lightgbm as lgb
import matplotlib.pylab as plt
from sklearn.metrics import roc_auc_score, roc_curve
_ = alt.renderers.enable('jupyterlab')

In [2]:
parent_dir = os.path.dirname(os.getcwd())
print(str(parent_dir))
data_dir = str(parent_dir) + "/data/ember2018/csv" # change this to where you unzipped the download

/Users/keremgirenes/Courses/Data Privacy Research/ember


## Read Data (CSV)

In [3]:
X_train = pd.read_csv(data_dir + "/X_train.csv", header=None)
y_train = pd.read_csv(data_dir + "/y_train.csv", header=None)
X_test = pd.read_csv(data_dir + "/X_test.csv", header=None)
y_test = pd.read_csv(data_dir + "/y_test.csv", header=None)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(600000, 2381)
(1, 600000)
(200000, 2381)
(1, 200000)


## Model

In [4]:
import warnings
warnings.filterwarnings("ignore")  # "error", "ignore", "always", "default", "module" or "once"

import numpy as np
import pandas as pd
import random
import os
#from imutils import paths
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import f1_score as f1

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier

import tensorflow as tf
import keras
import autokeras as ak
from keras.models import Sequential
from keras.layers import Dense
from tensorflow import keras
from tensorflow.keras.layers import Dense, Input, ReLU, Dropout
from tensorflow.keras.models import Model

Using TensorFlow backend


## Training and Testing

In [6]:
## Use small training units for local testing

from sklearn.model_selection import train_test_split
X_train_small, _, y_train_small, _ = train_test_split(X_train, y_train.T, test_size=0.999, random_state=42)
y_test = y_test.T
print("Training data (small) split.")

####

from sklearn.model_selection import RepeatedStratifiedKFold
cv_method = RepeatedStratifiedKFold(n_splits=5,  n_repeats=3, random_state=999)
from sklearn.preprocessing import PowerTransformer

params_NB = {'var_smoothing': np.logspace(-10,10, num=2000)}

gs_NB = GridSearchCV(estimator=GaussianNB(), param_grid=params_NB, cv=cv_method, verbose=1, scoring='accuracy')

classifiers = {# "GaussianNB": gs_NB, 
                "RFC": RandomForestClassifier(), 
                "DT": DecisionTreeClassifier(), 
                "SGD": SGDClassifier(),
                # "SVM": SVC(kernel = 'linear', degree=3), 
                "MLP": MLPClassifier(random_state=1, max_iter=300),
                "LGBM": LGBMClassifier(),
                "XGB": XGBClassifier()} 

for classifier_pair in classifiers.items():
    print("---------------------------")
    print(classifier_pair[0])
    
    classifier = classifier_pair[1]
    classifier.fit(X_train_small, y_train_small)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    print('Confusion Matrix', cm)

    #compute accuracy_score
    accuracy = acc(y_test, y_pred)
    print('accuracy', accuracy)

    #compute precision score
    precision_score = precision(y_test, y_pred, average='micro')
    print('precision', precision_score)

    #compute recall score
    recall_score = recall(y_test, y_pred, average='micro')
    print('recall', recall_score)

    #compute f1 score
    f1_score = f1(y_test, y_pred, average='micro')
    print('f1', f1_score)
    
print("---------------------------")

Training data (small) split.
---------------------------
RFC
Confusion Matrix [[73703 26297]
 [11079 88921]]
accuracy 0.81312
precision 0.81312
recall 0.81312
f1 0.81312
---------------------------
DT
Confusion Matrix [[67171 32829]
 [18008 81992]]
accuracy 0.745815
precision 0.745815
recall 0.745815
f1 0.745815
---------------------------
SGD
Confusion Matrix [[ 2047 97953]
 [  233 99767]]
accuracy 0.50907
precision 0.50907
recall 0.50907
f1 0.50907
---------------------------
MLP
Confusion Matrix [[21236 78764]
 [ 2237 97763]]
accuracy 0.594995
precision 0.594995
recall 0.594995
f1 0.594995
---------------------------
LGBM
[LightGBM] [Info] Number of positive: 303, number of negative: 297
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 110576
[LightGBM] [Info] Number of data points in the train set: 600, number of used features: 1405
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.505000 -> initscore=0.020001
[LightGBM] [Info] Start training f

Confusion Matrix [[74193 25807]
 [ 9462 90538]]
accuracy 0.823655
precision 0.823655
recall 0.823655
f1 0.823655
---------------------------
XGB
Confusion Matrix [[73852 26148]
 [ 9827 90173]]
accuracy 0.820125
precision 0.820125
recall 0.820125
f1 0.820125
---------------------------


## AutoKeras Model

In [None]:
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=10
)  # It tries 3 different models.
# Feed the structured data classifier with training data.
clf.fit(
    x=X_train_small,
    y=y_train_small,
    epochs=15,
)

# Evaluate the best model with testing data.
print(clf.evaluate(x=X_test, y=y_test))

In [None]:
model = clf.export_model()
model.summary()