In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

##  Step 1: Load a Data Set and Save it as a Pandas DataFrame

In [3]:
filename = os.path.join(os.getcwd(), "data", "spamDataset.csv")
df = pd.read_csv(filename, header=0)

df.head()

Unnamed: 0,email_text,spam
0,Subject: enron methanol ; meter # : 988291\r\n...,False
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",False
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",False
3,"Subject: photoshop , windows , office . cheap ...",True
4,Subject: re : indian springs\r\nthis deal is t...,False


## Step 2: Create Labeled Examples from the Data Set

In [4]:
y = df['spam']
X = df['email_text']

X.shape

(5170,)

In [5]:
X.head()

Unnamed: 0,email_text
0,Subject: enron methanol ; meter # : 988291\r\n...
1,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,"Subject: photoshop , windows , office . cheap ..."
4,Subject: re : indian springs\r\nthis deal is t...


Example of a spam email and an email that is not spam

In [6]:
print('A spam email: \n\n', X[67])
print('A non-spam email: \n\n', X[135])

A spam email: 

 Subject: re : husband soup would be
as you know election time is not the best thing for the economy .
economy is in a very unstable condition , as you can see gas prices
are going up along with the m o rtgvage rat e s . once the
r a te goes up you will not have a chance to s av e money again
for a very long time .
it is your last chance . get r e f inanced at 4 . 2 point !
http : / / www . fintod . com /
- -
despoil , compote a amende
the me orbital irruption
gfawn a ax henrietta
a the in boatswain
out whither the accompanist lint macintosh

A non-spam email: 

 Subject: re : tuesday , december 26 th
i will be here tuesday , also .
mark mccoy
12 / 20 / 2000 09 : 04 am
to : michael olsen / na / enron @ enron , tom acton / corp / enron @ enron , clem
cernosek / hou / ect @ ect , robert cotten / hou / ect @ ect , jackie young / hou / ect @ ect ,
sabrae zajac / hou / ect @ ect , carlos j rodriguez / hou / ect @ ect , mark
mccoy / corp / enron @ enron ,

## Step 3: Preprocess the Text

In [23]:
import gensim

In [8]:
original_X = X
X = X.apply(lambda row: gensim.utils.simple_preprocess(row))

In [9]:
X.head()

Unnamed: 0,email_text
0,"[subject, enron, methanol, meter, this, is, fo..."
1,"[subject, hpl, nom, for, january, see, attache..."
2,"[subject, neon, retreat, ho, ho, ho, we, re, a..."
3,"[subject, photoshop, windows, office, cheap, m..."
4,"[subject, re, indian, springs, this, deal, is,..."


In [10]:
original_X.head()

Unnamed: 0,email_text
0,Subject: enron methanol ; meter # : 988291\r\n...
1,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,"Subject: photoshop , windows , office . cheap ..."
4,Subject: re : indian springs\r\nthis deal is t...


## Step 4: Create Training and Test Data Sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.80, random_state=1234)

X_train.head()

Unnamed: 0,email_text
562,"[subject, hpl, nom, for, sept, see, attached, ..."
148,"[subject, copanno, changes, forwarded, by, ami..."
4831,"[subject, re, maynard, oil, revised, nom, dare..."
3385,"[subject, deal, for, december, can, either, of..."
2389,"[subject, heisse, sx, action, hallo, mein, lie..."


## Step 5: Training the Word2Vec Model and Inspecting the Word Embeddings

* size: dimension of each resulting vector that will represent each word.
* window: the number of words behind or ahead of a target word that will be used to provide context for that word
* min_count: the number of times a word must appear in our text document in order to create a word vector. The model will ignore words that do not satisfy the `min_count` specification, therefore ignoring words that are not important.

In [12]:
print("Begin")
word2vec_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

print("End")

Begin
End


In [13]:
len(word2vec_model.wv.key_to_index)  # retrieve vocabulary and measure its size

18015

## Step 6: Create Feature Vectors out of Word Embeddings for a Classifier

In [14]:
X_train.head()

Unnamed: 0,email_text
562,"[subject, hpl, nom, for, sept, see, attached, ..."
148,"[subject, copanno, changes, forwarded, by, ami..."
4831,"[subject, re, maynard, oil, revised, nom, dare..."
3385,"[subject, deal, for, december, can, either, of..."
2389,"[subject, heisse, sx, action, hallo, mein, lie..."


In [15]:
X_test.head()

Unnamed: 0,email_text
5075,"[subject, hpl, nom, for, may, see, attached, f..."
3817,"[subject, cleburne, tenaska, iv, plant, daren,..."
2967,"[subject, re, producer, connects, on, the, con..."
3014,"[subject, mail, hey, daren, is, this, your, ma..."
2185,"[subject, revision, forest, oil, november, gas..."


In [16]:
words = set(word2vec_model.wv.index_to_key)

print('Begin transforming X_train')
X_train_word_embeddings = np.array([np.array([word2vec_model.wv[word] for word in words if word in training_example])
                        for training_example in X_train], dtype=object)
print('Finish transforming X_train')

print('Begin transforming X_test')
X_test_word_embeddings = np.array([np.array([word2vec_model.wv[word] for word in words if word in training_example])
                        for training_example in X_test], dtype=object)
print('Finish transforming X_test')

Begin transforming X_train
Finish transforming X_train
Begin transforming X_test
Finish transforming X_test


In [17]:
print('Number of words in first training example: {0}'.format(len(X_train.iloc[0])))
print('First word in first training example: {0}'.format(X_train.iloc[0][0]))
print('Second word in first training example: {0}\n'.format(X_train.iloc[0][1]))

print('First word vector in first training example:\n {0}'.format(X_train_word_embeddings[0][0]))
print('\nSecond word vector in first training example: \n {0}\n'.format(X_train_word_embeddings[0][1]))


Number of words in first training example: 12
First word in first training example: subject
Second word in first training example: hpl

First word vector in first training example:
 [-0.5830294   0.26648676 -0.17418745 -0.6508297   0.17991309 -0.43210682
  0.24908517  1.3143828  -0.45265633 -0.48573413 -0.5473619  -0.91590846
  0.17094237 -0.2243076   0.25457758 -0.8009194   0.06790558 -0.54916674
  0.26852894 -0.87041545  0.9626919   0.5477013   0.3171348  -0.30841976
 -0.09730385  0.29818127 -0.9116429  -0.58347476 -0.5814771  -0.05723939
 -0.15337953  0.09529152  0.28491026 -0.6787923   0.2541098   0.65875345
 -0.21031994 -0.26048505 -0.13544951 -0.98579776  0.01115817  0.27915508
 -0.5110806   0.23631197  0.72715926 -0.5090099  -0.44776857 -0.35734907
  0.3916944   0.65690726  0.24637932 -0.30662492 -0.7989687  -0.3785071
 -0.36952832  0.45433134  0.33764812  0.10720887 -0.6627421   0.1114152
 -0.21397664  0.29842535  0.20470521 -0.06714604 -1.0909793   0.31735903
  0.5883857   0.4

In [18]:
print('Number of word vectors in first five examples in training set:')
for w in range(0, 5):
    print(len(X_train_word_embeddings[w]))

print('Number of word vectors in first five examples in test set:')
for w in range(0, 5):
    print(len(X_test_word_embeddings[w]))

Number of word vectors in first five examples in training set:
10
23
37
57
30
Number of word vectors in first five examples in test set:
10
31
59
8
59


This will cause an error when we train our model. We have to create feature vectors that will provide our classifier with a consistent set of features per example.

We can take an element-wise average of the word embeddings of the words contained in each training and test example. This makes feature vector representations that can be used as training and test features for our classifier.

In [19]:
X_train_feature_vector = []
for w in X_train_word_embeddings:
    if w.size:
        X_train_feature_vector.append(w.mean(axis=0))
    else:
        X_train_feature_vector.append(np.zeros(100, dtype=float))

X_test_feature_vector = []
for w in X_test_word_embeddings:
    if w.size:
        X_test_feature_vector.append(w.mean(axis=0))
    else:
        X_test_feature_vector.append(np.zeros(100, dtype=float))

Each example now consists of one feature, which is a numerical feature vector of length 100.

## Step 7: Fit a Logistic Regression Model to the Training Data and Evaluate the Model

Now we can train our model on our transformed data. The code cell below trains a logistic regression model and computes the AUC on the test set.

In [20]:
# 1. Create a LogisticRegression model object, and fit a Logistic Regression model to the transformed training data
model = LogisticRegression(max_iter=200)
model.fit(X_train_feature_vector, y_train)

# 2. Make predictions on the transformed test data using the predict_proba() method and
# save the values of the second column
probability_predictions = model.predict_proba(X_test_feature_vector)[:,1]

# 3. Make predictions on the transformed test data using the predict() method
class_label_predictions = model.predict(X_test_feature_vector)

# 4. Compute the Area Under the ROC curve (AUC) for the test data.
auc = roc_auc_score(y_test, probability_predictions)
print('AUC on the test data: {:.4f}'.format(auc))

AUC on the test data: 0.9868


In [21]:
print('Email #1:\n')
print(original_X[14])

print('\nPrediction: Is this a spam email? {}\n'.format(class_label_predictions[14]))

print('Actual: Is this a spam email? {}\n'.format(y_test.to_numpy()[14]))

Email #1:

Subject: tenaska iv july
darren :
please remove the price on the tenaska iv sale , deal 384258 , for july and enter the demand fee . the amount should be $ 3 , 902 , 687 . 50 .
thanks ,
megan

Prediction: Is this a spam email? True

Actual: Is this a spam email? True



In [22]:
print('Email #2:\n')
print(original_X[132])

print('\nPrediction: Is this a spam email? {}\n'.format(class_label_predictions[132]))

print('Actual: Is this a spam email? {}\n'.format(y_test.to_numpy()[132]))

Email #2:

Subject: re : noms / actual flow for 3 / 29 / 01
we agree with the nomination .
" eileen ponton " on 03 / 30 / 2001 10 : 05 : 40 am
to : david avila / lsp / enserch / us @ tu , charlie stone / texas utilities @ tu , melissa
jones / texas utilities @ tu , hpl . scheduling @ enron . com ,
liz . bellamy @ enron . com
cc :
subject : noms / actual flow for 3 / 29 / 01
nom mcf mmbtu
24 , 583 24 , 999 25 , 674
btu = 1 . 027

Prediction: Is this a spam email? False

Actual: Is this a spam email? False

