In [146]:
# Breanna Powell and Melody Behdarvandian
# CSS 576
# Assignment 3

# IDE: Visual Studio Code 

# -------------------------------------------------------------------------------------------------
#                    FOLLOW THESE STEPS TO INSTALL TENSORFLOW in VS CODE
# -------------------------------------------------------------------------------------------------
# 
# https://docs.anaconda.com/anaconda/user-guide/tasks/tensorflow/
# 1) Use the commands:
#    $ conda create -n tf tensorflow
#    $ conda activate tf
#
# https://code.visualstudio.com/docs/datascience/jupyter-notebooks#_create-or-open-a-jupyter-notebook
# 2) In the upper right hand corner, switch the kernel from "base" over to "tf(Python 3.10.9)"
#
# 3) Close this document and reopen it from Anaconda Navigator, but instead of "base" select "tf" from the dropdown menu
# 4) Install the following:
#    $ conda install pandas matplotlib scikit-learn seaborn
#    $ conda install -c conda-forge tensorflow keras
# -------------------------------------------------------------------------------------------------

import numpy as np
import pandas as pd

emailData = pd.read_csv('emails.csv')

## DATA EXPLORATION

In [147]:
emailData.shape # Check the size of the dataset

(5572, 5)

In [148]:
# Check to see if there are missing values (NaN or null)
emailData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       5572 non-null   object
 1   email       5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [149]:
emailData.head()

Unnamed: 0,label,email,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# PRE-PROCESSING TECHNIQUES

In [150]:
# Replace labels with 0 for ham and 1 for spam
emailData["label"] = (emailData["label"] == "spam").astype(int)

# Check to make sure it worked
emailData.head()

Unnamed: 0,label,email,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,


In [151]:
# Separate the features (x) from the labels (y)
x = emailData['email']
y = emailData["label"]

In [152]:
# Apply a count vectorizer to the training data to convert from text to token counts
# Count vectorizer -- will it ensure the testing data has no impact on the training data's normalization?
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer() # LOOK FOR OTHER PARAMETERS TO USE
features = cv.fit_transform(x)
features.get_shape()

(5572, 8673)

In [153]:
# Imputing missing values with the most frequent value
# https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9
from sklearn.impute import SimpleImputer
imputedEmails = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputedEmails.fit_transform(features)
imputedEmails.transform(features)

<5572x8673 sparse matrix of type '<class 'numpy.int64'>'
	with 73917 stored elements in Compressed Sparse Column format>

In [154]:
# Features is now in csr format (rather than a series or dataframe)
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
type(features)
x = pd.DataFrame(features.toarray()) # convert it to an dataframe instead

In [155]:
# Checking to make sure that there are no NaN's after using the imputer
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sum.html
x.isna().sum().sum()

# Should print 0

0

# Feature Reduction

# Split into Train and Test

In [156]:
# Split into training and testing data 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [157]:
x_train.shape

(4457, 8673)

In [158]:
len(x_train)

4457

In [159]:
len(y_train)

4457

# Neural Network

In [160]:

# https://www.kdnuggets.com/2018/06/basic-keras-neural-network-sequential-model.html

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

num_classes = 2 # 2 classes - spam and ham
train_shape = x_train.shape
filter_size = 32 # Should be a power of 2

a = train_shape[0]
b = train_shape[1]
# input_shape = (a, b)
# input_shape = (a, b, 1)
input_shape = len(x_train)
# input_shape = (b)

print(input_shape)

4457


In [161]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(8, input_dim = len(x_train), activation = 'relu'))
model.add(Dense(4, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid')) # Always use a sigmoid function for final layer when performing binary classification.

In [162]:
# https://keras.io/api/models/sequential/

# model = keras.Sequential(
#     [
#         keras.Input(shape=input_shape),

#         #https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2D
#         layers.Conv2D(32, kernel_size=(3, 3), activation="relu"), 
#         layers.MaxPooling2D(pool_size=(2, 2)),
#         layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
#         layers.MaxPooling2D(pool_size=(2, 2)),
#         layers.Flatten(),  # Turns it into a 1D array
#      #The dropout makes the model forget some of what it learned to prevent overfitting
#      #Do not set the drop out at .99 if you want an accurate model
#         #layers.Dropout(0.5), # ORIGINAL DROPOUT
#         #layers.Dropout(0.2), 
#         #layers.Dropout(0.99), # BAD ACCURACY
#         #layers.Dense(num_classes, activation="softmax")
#         layers.Dense(num_classes, activation="sigmoid")
#     ]
# )

In [163]:
model.summary() #This tells us what was in this network
# batch_size = 128
epochs = 1  #Should be much greater, but this is just to play around

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_35 (Dense)            (None, 8)                 35664     
                                                                 
 dense_36 (Dense)            (None, 4)                 36        
                                                                 
 dense_37 (Dense)            (None, 1)                 5         
                                                                 
Total params: 35,705
Trainable params: 35,705
Non-trainable params: 0
_________________________________________________________________


In [164]:
from keras.optimizers import Adam, SGD, Adagrad, Adadelta, RMSprop
# model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# model.compile(optimizer=Adam(lr=0.001), loss="categorical_crossentropy",  metrics=["accuracy"])
model.compile(loss="binary_crossentropy",  metrics=["accuracy"])
# model.compile(optimizer="adam", metrics=["accuracy"])
# model.fit(x = x_train, y = y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)
model.fit(x = x_train, y = y_train, epochs=epochs, validation_split=0.1) # ERROR IS HERE!!!!!!!!!!!!!!!!!!!!!

  return t[start:end]


ValueError: in user code:

    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_24" is incompatible with the layer: expected shape=(None, 4457), found shape=(None, 8673)


In [None]:
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])

ValueError: in user code:

    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1727, in test_function  *
        return step_function(self, iterator)
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1713, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1701, in run_step  **
        outputs = model.test_step(data)
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\training.py", line 1665, in test_step
        y_pred = self(x, training=False)
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\brely\anaconda3\envs\tf\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_9" is incompatible with the layer: expected shape=(None, 4457, 8673, 1), found shape=(None, 8673)


# Metrics

In [None]:
from sklearn.metrics import f1_score, precision_score,recall_score

# Visual

In [None]:
# https://towardsdatascience.com/spam-detection-in-emails-de0398ea3b48

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# cf_matrix =confusion_matrix(y_test,y_predict)

# ax= plt.subplot()
# #annot=True to annotate cells
# sns.heatmap(cf_matrix, annot=True, ax = ax,cmap='Blues',fmt='');
# # labels, title and ticks
# ax.set_xlabel('Predicted labels');
# ax.set_ylabel('True labels');
# ax.set_title('Confusion Matrix');
# ax.xaxis.set_ticklabels(['Not Spam', 'Spam']); ax.yaxis.set_ticklabels(['Not Spam', 'Spam']);