In [1]:
# import statements
import pandas as pd
import matplotlib.pyplot as pyplot
import livelossplot
import numpy as np

import os
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
tf.get_logger().setLevel('ERROR')

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras import models
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
from keras import backend as K

2023-02-28 16:15:09.917144: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# functions
# https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model


def recall_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_metric(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_metric(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# wrap the model building in a function so that it can be passed to the Keras Classifier
def model_build():
    nn = models.Sequential()
    # nn.add(Dense(8, activation='relu', input_dim=dyn_in_dim)) # 8 neurons
    nn.add(Dense(8, activation='relu', input_shape=(dyn_in_dim,))) # 8 neurons
    nn.add(Dropout(0.5)) # let the model forget some stuff (avoid overfitting)
    nn.add(Dense(4, activation='relu')) # 4 neurons
    nn.add(Dense(1, activation='sigmoid')) # 1 neuron
    nn.add(Dropout(0.5)) # let the model forget some stuff (avoid overfitting)
    nn.add(Dense(4, activation='relu')) # 4 neurons
    
    # compile the model
    nn.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc', precision_metric])
    return nn

def get_results(results):
    print(f"Highest mean score of {results.best_score_:.2f} was achieved using {results.best_params_}\n")
    means = results.cv_results_['mean_test_score']
    params = results.cv_results_['params']
    n_scores = len(means)
    print("List of combinations:")
    for i in range(n_scores):
        print(f"mean of {means[i]:.2f} using {params[i]}")

In [3]:
# get data
df = pd.read_csv('email_features.csv')

# get a random sample
np.random.seed(42) # set up random permutation
rndperm = np.random.permutation(df.shape[0])
n = 1000
df = df.loc[rndperm[:n],:]
print('df shape: ', df.shape)
df.head()

df shape:  (1000, 17)


Unnamed: 0,Subject,To,From,Date,CC,BCC,Message ID,In Reply To,References,Reply To,Sender,Received,Content Type,Content Encoding,Content Disposition,Body,Body Length
2882,日本銀行データ通信システム、代表理事　高木と申します。,bruce@bruce-guenter.dyndns.org,recent@q3byryq3rnqvi.link,"Sun, 01 May 2016 16:25:31 +0900",,,468874259_7727598571052417dabd@q3byryq3rnqvi.link,,,,,(qmail 23393 invoked from network); 1 May 2016...,"text/plain; charset=""Shift_JIS""",,,【日本銀行データ通信システム、代表理事　高木と申します。】\r\n\r\nhttp://q3...,308
3933,今日、携帯を買ってもらった現役の女子高生！！＼(^o^)／,lists-reiserfs-list@bruce-guenter.dyndns.org,mailhs5z@3bn40m28tx12neoc.shop,"Mon, 02 Apr 2018 09:24:59 +0900",,,<eQIIOUAIvAsFPI3c@3bn40m28tx12neoc.shop>,,,,,(qmail 20132 invoked from network); 2 Apr 2018...,"text/plain; charset=""Shift_JIS""",,,http://0gfvmvt81p7mniks.shop/syCVG80lQpK6gBUn,45
304,SHOOTERS: This thing aims the gun for you (Hur...,bruce@untroubled.org,Red Dot <RedDot@reddot.icu>,"Mon, 29 Apr 2019 12:14:09 -0400",,,<ezt9my6wvebanid5-wct4hljdxdfkho8r-217-127ae@r...,,,Red Dot <RedDot@reddot.icu>,,(qmail 25017 invoked from network); 29 Apr 201...,"multipart/alternative; boundary=""9753b2b11066a...",,,SHOOTERS: This thing aims the gun for you (Hur...,3944
3325,Vertigo and dizziness is just a normal part of...,bruce@untroubled.org,Ear Wax Vertigo <BenignVertigo@skinlabx.us>,"Fri, 07 Jan 2022 09:10:08 -0500",,,<d4ybhxik3nfzh55o-fa8ixs2c5zu1qapv-252e-381@sk...,,,Benign Vertigo <MigraineDizziness@skinlabx.us>,,,"multipart/alternative; boundary=""d8ff973dc81e3...",,,Vertigo and dizziness is just a normal part of...,2296
2626,��� ����/������ ���� ������ �ֽ��ϴ�. tx dc,bruce@untroubled.org,�������̻� <ha1g2s2h@nate.com>,"Wed, 02 Mar 2022 18:35:18 -0700",,,<86a64gdyq5igj$0m6e77$$fvf9@8xgz.h7ww>,,,�������̻� <ha1g2s2h@nate.com>,,from [135.161.247.24]\tby 45.63.65.23 with ESM...,"multipart/alternative; boundary=""..._9FD.F7AD8...",,,"<p>����������. ��������~~~!!\n <div>����,���...",576


In [4]:
# encode data numerically
onehot = OneHotEncoder(dtype=int, sparse=False)
df = onehot.fit_transform(df)
# convert x back into a dataframe
df = pd.DataFrame(df)

In [5]:
# slice data
x = df.iloc[:,:-1]
y = df.iloc[:,-1]
print('x shape: ', x.shape, '\ny shape: ', y.shape)

x shape:  (1000, 6904) 
y shape:  (1000,)


In [6]:
# create simple classification: if body length < 1000, call it spam
y = y.apply(lambda x: 1 if x < 1000 else 0) # lambda transforms all instances in the pandas series (ie, y)
y.head()

0    1
1    1
2    1
3    1
4    1
Name: 6904, dtype: int64

In [7]:
# get dynamic input dimensions
dyn_in_dim = x.shape[1]

In [8]:
# set up early stopping
es = EarlyStopping(
    monitor="loss",
    min_delta=0,
    patience=5,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=False,
)

In [9]:
# create parameter grid (based on the fit method's hyperparameters)
param_grid = {
    'epochs': [5,10,25,50,100],
    'batch_size': [16,32,64,128,256,512],
}

In [10]:
# create the KerasClassifier
model = KerasClassifier(build_fn=model_build, verbose=0)

In [11]:
# set up the grid
grid = GridSearchCV(estimator=model, 
                    param_grid=param_grid, 
                    cv=3,
                    verbose=2)

In [12]:
# set up random search
random = RandomizedSearchCV(estimator=model, 
                            param_distributions=param_grid, 
                            cv=10,
                            verbose=2)

In [13]:
# GridSearch: train the model with early stopping (in callbacks)
grid_result = grid.fit(x, y, callbacks=[es])

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] END ............................batch_size=16, epochs=5; total time=   1.4s
[CV] END ............................batch_size=16, epochs=5; total time=   1.4s
[CV] END ............................batch_size=16, epochs=5; total time=   1.2s
[CV] END ...........................batch_size=16, epochs=10; total time=   1.4s
[CV] END ...........................batch_size=16, epochs=10; total time=   1.5s
[CV] END ...........................batch_size=16, epochs=10; total time=   1.5s
[CV] END ...........................batch_size=16, epochs=25; total time=   2.0s
[CV] END ...........................batch_size=16, epochs=25; total time=   1.5s
[CV] END ...........................batch_size=16, epochs=25; total time=   1.7s
[CV] END ...........................batch_size=16, epochs=50; total time=   2.3s
[CV] END ...........................batch_size=16, epochs=50; total time=   2.0s
[CV] END ...........................batch_size=1

In [None]:
# RandomizedSearch: train the model with early stopping (in callbacks)
random_result = random.fit(x, y, callbacks=[es])

In [14]:
# check random results
print("Grid Search Results:\n")
get_results(grid_result)

GridSearch Results:

Highest mean score of 1.00 was achieved using {'batch_size': 16, 'epochs': 5}

List of combinations:
mean of 1.00 using {'batch_size': 16, 'epochs': 5}
mean of 1.00 using {'batch_size': 16, 'epochs': 10}
mean of 1.00 using {'batch_size': 16, 'epochs': 25}
mean of 1.00 using {'batch_size': 16, 'epochs': 50}
mean of 1.00 using {'batch_size': 16, 'epochs': 100}
mean of 0.67 using {'batch_size': 32, 'epochs': 5}
mean of 1.00 using {'batch_size': 32, 'epochs': 10}
mean of 1.00 using {'batch_size': 32, 'epochs': 25}
mean of 1.00 using {'batch_size': 32, 'epochs': 50}
mean of 1.00 using {'batch_size': 32, 'epochs': 100}
mean of 0.00 using {'batch_size': 64, 'epochs': 5}
mean of 1.00 using {'batch_size': 64, 'epochs': 10}
mean of 1.00 using {'batch_size': 64, 'epochs': 25}
mean of 1.00 using {'batch_size': 64, 'epochs': 50}
mean of 1.00 using {'batch_size': 64, 'epochs': 100}
mean of 0.67 using {'batch_size': 128, 'epochs': 5}
mean of 0.63 using {'batch_size': 128, 'epochs

In [15]:
# check random results
print("Randomized Search Results:\n")
get_results(random_result)

RandomizedSearch Results:

Highest mean score of 1.00 was achieved using {'epochs': 100, 'batch_size': 16}

List of combinations:
mean of 0.60 using {'epochs': 10, 'batch_size': 128}
mean of 0.40 using {'epochs': 10, 'batch_size': 256}
mean of 1.00 using {'epochs': 100, 'batch_size': 16}
mean of 1.00 using {'epochs': 100, 'batch_size': 128}
mean of 1.00 using {'epochs': 25, 'batch_size': 64}
mean of 1.00 using {'epochs': 5, 'batch_size': 16}
mean of 0.90 using {'epochs': 25, 'batch_size': 16}
mean of 0.40 using {'epochs': 5, 'batch_size': 256}
mean of 1.00 using {'epochs': 100, 'batch_size': 256}
mean of 1.00 using {'epochs': 50, 'batch_size': 64}
