<a href="https://colab.research.google.com/github/admenezes/spam-classifier/blob/main/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# libraries
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string
import nltk

### Read data

In [None]:
# Load the data
# Colab-specific: requires the google.colab package; the recorded output shows enron.csv being saved.
# NOTE(review): the return value `f` is not read by any later cell visible here.
from google.colab import files
f = files.upload()

Saving enron.csv to enron.csv


In [None]:
# reading the comma-separated file; header=None means the first line is treated as data, not column names
data = pd.read_csv('enron.csv', sep=',', header=None)

In [None]:
# name the two columns: the email text and its numeric label
data.columns = ['Text', 'Label']

In [None]:
# preview the first rows of the raw data
data.head()

Unnamed: 0,Text,Label
0,Subject: christmas tree farm pictures,0
1,"Subject: vastar resources , inc . gary , produ...",0
2,Subject: calpine daily gas nomination - calpin...,0
3,Subject: re : issue fyi - see note below - alr...,0
4,Subject: meter 7268 nov allocation fyi . - - -...,0


In [None]:
# sanity check: Label is expected to hold only 0 (ham) or 1 (spam)
data.Label.unique()

array([0, 1])

In [None]:
# (rows, columns) of the frame; the row count is the number of emails in the file
data.shape

(4302, 2)

In [None]:
# build two one-hot indicator lists from Label:
#   label 0 (ham)  -> spam = 0, ham = 1
#   label 1 (spam) -> spam = 1, ham = 0
# any other label value contributes no entry to either list
known_labels = [label for label in data.Label if label == 0 or label == 1]
spam = [1 if label == 1 else 0 for label in known_labels]
ham = [1 - flag for flag in spam]

In [None]:
# attach the one-hot lists built above as the Spam and Ham columns
data['Spam']= spam
data['Ham']= ham

In [None]:
# preview the data with the new Spam and Ham indicator columns
data.head()

Unnamed: 0,Text,Label,Spam,Ham
0,Subject: christmas tree farm pictures,0,0,1
1,"Subject: vastar resources , inc . gary , produ...",0,0,1
2,Subject: calpine daily gas nomination - calpin...,0,0,1
3,Subject: re : issue fyi - see note below - alr...,0,0,1
4,Subject: meter 7268 nov allocation fyi . - - -...,0,0,1


### Clean data

In [None]:
# removing punctuation from the text (the result is saved in a new column called Text_Clean)
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    The punctuation set is passed through ``re.escape`` before it is placed
    inside the character class. Without escaping, the ``\\]`` sequence inside
    ``string.punctuation`` is read as an escaped ``]``, so the backslash
    itself is never matched and survives in the output.

    Args:
        text: the string to clean.

    Returns:
        A new string without ASCII punctuation; whitespace is left untouched,
        so removed characters may leave consecutive spaces behind.
    """
    punct_class = '[' + re.escape(string.punctuation) + ']'
    return re.sub(punct_class, '', text)

# store the punctuation-free text next to the original column
data['Text_Clean'] = data['Text'].apply(remove_punct)

In [None]:
# tokenizing the text: each cleaned email becomes a list of word tokens
# NOTE(review): WordNetLemmatizer is imported here but not used in any cell visible in this notebook.
# NOTE(review): the 'punkt' download is presumably the data word_tokenize needs — confirm against the NLTK docs.
from nltk import word_tokenize, WordNetLemmatizer
nltk.download('punkt')
tokens = [word_tokenize(sen) for sen in data.Text_Clean] 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# putting all text in lowercase for uniformity
def lower_token(tokens):
    """Return a new list holding the lowercase form of every token in ``tokens``."""
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered
    
# lowercase every email's token list
lower_tokens = list(map(lower_token, tokens))

In [None]:
# importing stopwords: fetch the NLTK stopword corpus and keep the English word list
# stoplist is read as a global by remove_stop_words
from nltk.corpus import stopwords
nltk.download('stopwords')
stoplist = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# removing stopwords from the text
def remove_stop_words(tokens, stop_words=None):
    """Return the tokens that are not stop words, in their original order.

    Args:
        tokens: iterable of word strings.
        stop_words: optional iterable of words to drop. When omitted, the
            notebook-level ``stoplist`` is used, which keeps existing
            one-argument calls working unchanged.

    Returns:
        A new list with the stop words filtered out.
    """
    if stop_words is None:
        stop_words = stoplist
    # one set build per call replaces a linear list scan for every token
    blocked = set(stop_words)
    return [word for word in tokens if word not in blocked]

In [None]:
# drop stop words from every lowercase token list
filtered_words = list(map(remove_stop_words, lower_tokens))

In [None]:
# rebuild one space-separated string per email from its filtered tokens
result = list(map(' '.join, filtered_words))

In [None]:
# saving the processed text (lowercased, punctuation and stop words removed) into the Text_Final column
data['Text_Final'] = result

In [None]:
# saving the filtered token lists into the tokens column
data['tokens'] = filtered_words

In [None]:
# keep only the processed columns and the labels
# NOTE(review): this rebinds `data` without the Text / Text_Clean columns, so the cleaning cells
# above cannot be re-run after this one without reloading the CSV.
data = data[['Text_Final', 'tokens', 'Label', 'Spam', 'Ham']]

In [None]:
# preview the first four processed rows
data[:4]

Unnamed: 0,Text_Final,tokens,Label,Spam,Ham
0,subject christmas tree farm pictures,"[subject, christmas, tree, farm, pictures]",0,0,1
1,subject vastar resources inc gary production h...,"[subject, vastar, resources, inc, gary, produc...",0,0,1
2,subject calpine daily gas nomination calpine d...,"[subject, calpine, daily, gas, nomination, cal...",0,0,1
3,subject issue fyi see note already done stella...,"[subject, issue, fyi, see, note, already, done...",0,0,1


### Split data into test and train

In [None]:
# splitting the data: test_size=0.10 holds out 10% of the rows for testing;
# random_state=42 is passed so the split can be repeated
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

In [None]:
# building the training vocabulary from the token lists
# for TRAINING: total word count, number of distinct words and longest email (in tokens)
all_training_words = [word for doc in data_train["tokens"] for word in doc]
training_sentence_lengths = list(map(len, data_train["tokens"]))
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

430834 words total, with a vocabulary size of 38715
Max sentence length is 3475


In [None]:
# for TESTING: total word count, number of distinct words and longest email (in tokens)
all_test_words = [word for doc in data_test["tokens"] for word in doc]
test_sentence_lengths = list(map(len, data_test["tokens"]))
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

50592 words total, with a vocabulary size of 9333
Max sentence length is 1514


### Load Google News Word2Vec model

In [None]:
# loading the Google News Word2Vec model for word embeddings
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

--2021-04-16 04:11:48--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.99.165
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.99.165|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz.1’


2021-04-16 04:13:35 (14.8 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz.1’ saved [1647046227/1647046227]



In [None]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the embedding vectors of the tokens in one document.

    Args:
        tokens_list: sequence of word tokens.
        vector: mapping-like object supporting ``word in vector`` and ``vector[word]``.
        generate_missing: when True, a token missing from ``vector`` is given a
            ``np.random.rand(k)`` vector; when False it is given ``np.zeros(k)``.
        k: length of the vectors.

    Returns:
        The element-wise mean over all tokens (missing ones included), or
        ``np.zeros(k)`` when ``tokens_list`` is empty.
    """
    token_count = len(tokens_list)
    if token_count < 1:
        return np.zeros(k)

    # pick the generator used for tokens that have no embedding
    fallback = np.random.rand if generate_missing else np.zeros

    rows = []
    for word in tokens_list:
        rows.append(vector[word] if word in vector else fallback(k))
    return np.sum(rows, axis=0) / token_count

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged embedding per row of ``clean_comments``.

    Args:
        vectors: embedding lookup forwarded to ``get_average_word2vec``.
        clean_comments: object whose ``'tokens'`` entry supports ``.apply``
            (a DataFrame with a ``tokens`` column in this notebook).
        generate_missing: forwarded to ``get_average_word2vec``.

    Returns:
        A list with the averaged vector of every row, in row order.
    """
    def embed(token_list):
        return get_average_word2vec(token_list, vectors, generate_missing=generate_missing)

    per_row = clean_comments['tokens'].apply(embed)
    return list(per_row)

### Get Embeddings

In [None]:
# averaged word2vec vector for every training email (missing words get random vectors)
# NOTE(review): training_embeddings is not referenced by any later cell visible in this notebook.
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [None]:
# number of token ids per email fed to the network (used as maxlen when padding)
# NOTE(review): the longest training email printed above has 3475 tokens, so long emails are
# heavily truncated — confirm that 50 is intended.
MAX_SEQUENCE_LENGTH = 50
# width of each word vector; presumably matches the 300-dimensional GoogleNews vectors — confirm
EMBEDDING_DIM = 300

### Tokenize and Pad sequences

In [None]:
# assigning each word to an integer, using the training texts only
train_texts = data_train["Text_Final"].tolist()
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

# word -> integer id mapping learned from the training texts
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 38714 unique tokens.


In [None]:
# bring every training sequence to MAX_SEQUENCE_LENGTH ids
# NOTE(review): which end is padded / truncated depends on the pad_sequences defaults — confirm against the Keras docs.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# build the embedding matrix: row i holds the Google News vector of the word whose tokenizer id is i
# words without a pretrained vector get a random vector; row 0 is never assigned and stays all zeros
vocab_rows = len(train_word_index) + 1
train_embedding_weights = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in train_word_index.items():
    if token in word2vec:
        train_embedding_weights[row, :] = word2vec[token]
    else:
        train_embedding_weights[row, :] = np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(38715, 300)


In [None]:
# encoding the test texts with the tokenizer fitted on the training texts, then padding to the same length
test_texts = data_test["Text_Final"].tolist()
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

### Define CNN

In [None]:
# the text is passed to a CNN as a sequence of integer word ids
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarise a multi-kernel 1D CNN text classifier.

    Args:
        embeddings: weight matrix handed to the Embedding layer; presumably
            shaped (num_words, embedding_dim) — confirm against the caller.
        max_sequence_length: number of word ids in each input sequence.
        num_words: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        labels_index: number of output units (one sigmoid unit per label column).

    Returns:
        The compiled Model. ``model.summary()`` is printed as a side effect.
    """
    kernel_widths = (2, 3, 4, 5, 6)

    # embedding lookup initialised from the supplied matrix and frozen (trainable=False)
    lookup = Embedding(num_words,
                       embedding_dim,
                       weights=[embeddings],
                       input_length=max_sequence_length,
                       trainable=False)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded = lookup(sequence_input)

    def pooled_branch(width):
        # one convolution over `width` consecutive words, reduced to its maximum activation per filter
        feature_map = Conv1D(filters=200, kernel_size=width, activation='relu')(embedded)
        return GlobalMaxPooling1D()(feature_map)

    # one branch per kernel width, concatenated into a single feature vector
    branches = [pooled_branch(width) for width in kernel_widths]
    merged = concatenate(branches, axis=1)

    # dropout randomly zeroes some inputs during training to reduce overfitting;
    # the dense layers connect every input unit to every output unit
    hidden = Dropout(0.1)(merged)
    hidden = Dense(128, activation='relu')(hidden)
    hidden = Dropout(0.2)(hidden)
    preds = Dense(labels_index, activation='sigmoid')(hidden)

    # binary_crossentropy is applied to the sigmoid outputs against the 0/1 target columns
    # adam is a stochastic gradient descent variant
    # the 'acc' metric reports accuracy while training
    network = Model(sequence_input, preds)
    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['acc'])
    network.summary()
    return network

In [None]:
# target columns, in output order: position 0 is Spam, position 1 is Ham
label_names = ['Spam', 'Ham']

In [None]:
# target matrix: one row per training email holding its Spam and Ham indicator values
y_train = data_train[label_names].values

In [None]:
# aliases for the padded training sequences and their targets, as consumed by model.fit
x_train = train_cnn_data
y_tr = y_train

In [None]:
# executing CNN function
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 50)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 50, 300)      11614500    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 49, 200)      120200      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 48, 200)      180200      embedding_1[0][0]                
____________________________________________________________________________________________

### Train CNN

In [None]:
# defining number of iterations and batch size
num_epochs = 5
batch_size = 34

In [None]:
# validation_split=0.1 sets aside 10% of the training arrays as a validation set,
# giving an early, per-epoch estimate of accuracy on emails the weights are not fitted to
# NOTE(review): the earlier remark "makes the filter more biased" was unclear; the usual caveat is that
# choosing settings based on validation scores makes those scores optimistic — confirm what was meant.
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Test CNN

In [None]:
# score the padded test sequences; the output layer has one unit per entry of label_names
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [None]:
# maps an output position to a Label value: position 0 is the Spam column (Label 1),
# position 1 is the Ham column (Label 0) — this must stay in the same order as label_names
labels = [1, 0]

In [None]:
# for each prediction row take the position of the largest output and translate it to a Label value
prediction_labels = [labels[np.argmax(p)] for p in predictions]

In [None]:
# accuracy on the test data: share of emails whose predicted label equals the true Label
(data_test.Label == prediction_labels).sum() / len(prediction_labels)

0.9605568445475638

In [None]:
# class balance of the test set: Label 0 = ham, Label 1 = spam
# NOTE(review): value_counts() lists the most frequent value first, so ham is on the first row
# only because it is the majority class here — read the index, not the row position.
data_test.Label.value_counts()

0    367
1     64
Name: Label, dtype: int64