In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model

In [2]:
def modelStats(actual, predict):
  return metrics.accuracy_score(actual, predict), metrics.precision_score(actual, predict), metrics.recall_score(actual, predict)

In [3]:
full_dataset = pd.read_csv('spam_ham_dataset.csv')
full_dataset.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [4]:
# Check whether the label and label_num match up
np.sum((full_dataset['label'] == 'spam').astype(int) != full_dataset['label_num']) == 0

True

In [5]:
# Check counts of spam and ham emails
full_dataset['label'].value_counts()

label
ham     3672
spam    1499
Name: count, dtype: int64

Split into Training and Test Set

In [64]:
# Split Dataset into training and test set
x_data, y_data = full_dataset['text'], full_dataset['label_num']
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.1, random_state=41)
len(x_train), len(x_test)

(4653, 518)

In [65]:
# Check how well distributed spam and ham emails are in train and test sets
prop_spam_train, prop_spam_test = y_train.value_counts().loc[1] / len(y_train) , y_test.value_counts().loc[1] / len(y_test)
print('Proportion of spam emails in training set: ', prop_spam_train)
print('Proportion of spam emails in test set: ', prop_spam_test)

Proportion of spam emails in training set:  0.28841607565011823
Proportion of spam emails in test set:  0.3030888030888031


Feature engineering

In [89]:
# Count vectorizer used to count occurences of words in each email
cv = CountVectorizer(stop_words='english', min_df=100, max_df=4000)
cv.fit(x_train)
x_train_counts = cv.transform(x_train).toarray()
x_train_counts.shape

(4653, 455)

SVM Naive

In [68]:
# Create naive model
model = svm.SVC(kernel='linear')
model.fit(x_train_counts, y_train)

In [70]:
# Naive evaluation of training set training model on full training set
y_train_predict = model.predict(x_train_counts)
modelStats(y_train, y_train_predict)

(0.991833225875779, 0.9745269286754003, 0.9977645305514158)

SVM with cross validation

In [71]:
# Pipeline model to then use to run grid search cross validation
pipe = Pipeline([('count-vec', CountVectorizer(stop_words='english')), ('svm', svm.SVC())])
grid = {
    'count-vec__min_df': [30, 40, 50, 75, 100],
    'count-vec__max_df': [3500, 4000, 4500],
    'count-vec__binary': [True, False],
    'svm__kernel': ['rbf', 'linear', 'sigmoid'],
    'svm__C': [1.0, 15.0, 50.0]
}
grid_search = GridSearchCV(pipe, param_grid=grid)
grid_search.fit(x_train, y_train)

In [72]:
grid_search.best_params_, grid_search.best_score_

({'count-vec__binary': False,
  'count-vec__max_df': 3500,
  'count-vec__min_df': 40,
  'svm__C': 50.0,
  'svm__kernel': 'rbf'},
 0.9666909208505133)

In [73]:
# Extract best model from grid search
best_model = grid_search.best_estimator_
best_train_predictions = best_model.predict(x_train)
modelStats(y_train, best_train_predictions)

(0.9926928863099076, 0.9759825327510917, 0.9992548435171386)

In [88]:
# Evaluate test set
test_predictions = best_model.predict(x_test)
modelStats(y_test, test_predictions)

(0.9536679536679536, 0.8982035928143712, 0.9554140127388535)

Logistic Regression

In [82]:
# Create Logistic Regression model with built in cross validation
lr_model = linear_model.LogisticRegressionCV(cv=5, max_iter=500)
lr_model.fit(x_train_counts, y_train)
lr_train_predict = lr_model.predict(x_train_counts)
modelStats(y_train, lr_train_predict)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.9860305179454115, 0.9577060931899641, 0.9955290611028316)

In [85]:
# Evaluate the test set
x_test_counts = cv.transform(x_test)
lr_test_predict = lr_model.predict(x_test_counts)
modelStats(y_test, lr_test_predict)

(0.9517374517374517, 0.888235294117647, 0.9617834394904459)