In [61]:
# Breanna Powell and Melody Behdarvandian
# CSS 576
# Assignment 3

# IDE: Visual Studio Code 

# -------------------------------------------------------------------------------------------------
#                    FOLLOW THESE STEPS TO INSTALL TENSORFLOW in VS CODE
# -------------------------------------------------------------------------------------------------
# 
# https://docs.anaconda.com/anaconda/user-guide/tasks/tensorflow/
# 1) Use the commands:
#    $ conda create -n tf tensorflow
#    $ conda activate tf
#
# https://code.visualstudio.com/docs/datascience/jupyter-notebooks#_create-or-open-a-jupyter-notebook
# 2) In the upper right hand corner, switch the kernel from "base" over to "tf(Python 3.10.9)"
#
# 3) Close this document and reopen it from Anaconda Navigator, but instead of "base" select "tf" from the dropdown menu
# 4) Install the following:
#    $ conda install pandas matplotlib scikit-learn seaborn
#    $ conda install -c conda-forge tensorflow keras
# -------------------------------------------------------------------------------------------------

import numpy as np
import pandas as pd

emailData = pd.read_csv('emails.csv')

## DATA EXPLORATION

In [62]:
emailData.shape # Check the size of the dataset

(5572, 5)

In [63]:
# Check to see if there are missing values (NaN or null)
emailData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       5572 non-null   object
 1   email       5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [64]:
emailData.head()

Unnamed: 0,label,email,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# PRE-PROCESSING TECHNIQUES

In [65]:
# Replace labels with 0 for ham and 1 for spam
emailData["label"] = (emailData["label"] == "spam").astype(int)

# Check to make sure it worked
emailData.head()

Unnamed: 0,label,email,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,


In [66]:
# Separate the features (x) from the labels (y)
x = emailData['email']
y = emailData["label"]

In [67]:
# Apply a count vectorizer to the training data to convert from text to token counts
# Count vectorizer -- will it ensure the testing data has no impact on the training data's normalization?
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer() # LOOK FOR OTHER PARAMETERS TO USE
features = cv.fit_transform(x)
features.get_shape()

(5572, 8673)

In [68]:
# Imputing missing values with the most frequent value
# https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9
from sklearn.impute import SimpleImputer
imputedEmails = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputedEmails.fit_transform(features)
imputedEmails.transform(features)

<5572x8673 sparse matrix of type '<class 'numpy.int64'>'
	with 73917 stored elements in Compressed Sparse Column format>

In [69]:
# Features is now in csr format (rather than a series or dataframe)
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
type(features)
x = pd.DataFrame(features.toarray()) # convert it to an dataframe instead

In [70]:
# Checking to make sure that there are no NaN's after using the imputer
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sum.html
x.isna().sum().sum()

# Should print 0

0

# Feature Reduction

# Split into Train and Test

In [71]:
# Split into training and testing data 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [72]:
x_train.shape

(4457, 8673)

In [73]:
len(x_train)

4457

In [74]:
len(y_train)

4457

# Neural Network

In [75]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

num_classes = 1 # - spam or ham
train_shape = x_train.shape
filter_size = 32 # Should be a power of 2

b = train_shape[1]
input_shape = (b,)

print(input_shape)

(8673,)


In [76]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
# https://keras.io/api/models/sequential/


# Activation function options:
# - relu
# - sigmoid - good for last layer of binary classification
# - elu - need to research this one

model = Sequential()
model.add(Dense(8, input_shape= input_shape, activation = 'relu'))
model.add(Dense(4, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid')) # Always use a sigmoid function for final layer when performing binary classification.

In [87]:
# https://keras.io/api/callbacks/early_stopping/
# https://machinelearningmastery.com/early-stopping-to-avoid-overtraining-neural-network-models/
# Monitor options:
#  - loss
#  - accuracy
#  - val_loss
#  - val_binary_crossentropy
# Mode options:
#  - min


callback = keras.callbacks.EarlyStopping(monitor='accuracy', patience=3) # stops if the accuracy gets high
# callback = keras.callbacks.EarlyStopping(monitor='loss', patience=3) # monitors loss if the performance goes down

In [88]:
model.summary() #This tells us what was in this network

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 8)                 69392     
                                                                 
 dense_8 (Dense)             (None, 4)                 36        
                                                                 
 dense_9 (Dense)             (None, 1)                 5         
                                                                 
Total params: 69,433
Trainable params: 69,433
Non-trainable params: 0
_________________________________________________________________


In [89]:
# https://www.geeksforgeeks.org/choose-optimal-number-of-epochs-to-train-a-neural-network-in-keras/
epochs = 50

In [90]:
# https://keras.io/api/losses/
# https://www.tensorflow.org/api_docs/python/tf/keras/losses  <---------- TO DO: LOOK THROUGH THESE
# Loss parameter options: 
# - binary_crossentropy # https://keras.io/api/losses/probabilistic_losses/#binarycrossentropy-class
# - ?????

# https://www.tensorflow.org/api_docs/python/tf/keras/Sequential
# https://www.tensorflow.org/api_docs/python/tf/keras/metrics
# metrics can be a list, like this: metrics=["mae", "acc"]
# Metrics parameter options:
# - accuracy
# - precision
# - recall

# TO DO: decide if we should use any optimizers (rmsprop, adam, sgd, adagrad, adadelta)

from keras.optimizers import Adam, SGD, Adagrad, Adadelta, RMSprop
model.compile(loss="binary_crossentropy",  metrics=["accuracy"])
model.fit(x = x_train, y = y_train, epochs=epochs, callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


4

In [91]:
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])

Test loss: 0.2559967637062073
Test accuracy: 0.9865471124649048


# Metrics

In [94]:
from sklearn.metrics import f1_score, precision_score,recall_score

y_predict = model.predict(x_test)


# I wonder about this metric? -- it uses spam and ham in the example!
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.brier_score_loss.html#sklearn.metrics.brier_score_loss



# Visual

In [95]:
# https://towardsdatascience.com/spam-detection-in-emails-de0398ea3b48

# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt

# cf_matrix =confusion_matrix(y_test,y_predict)

# ax= plt.subplot()
# #annot=True to annotate cells
# sns.heatmap(cf_matrix, annot=True, ax = ax,cmap='Blues',fmt='');
# # labels, title and ticks
# ax.set_xlabel('Predicted labels');
# ax.set_ylabel('True labels');
# ax.set_title('Confusion Matrix');
# ax.xaxis.set_ticklabels(['Not Spam', 'Spam']); ax.yaxis.set_ticklabels(['Not Spam', 'Spam']);

ValueError: Classification metrics can't handle a mix of binary and continuous targets