In [157]:
import numpy as np
import pandas as pd
import os
import glob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Spam Detection Filter

Let us illustrate a few concepts from before on a case study -- building a spam detection filter. The data for this notebook is from https://medium.com/analytics-vidhya/building-a-spam-filter-from-scratch-using-machine-learning-fc58b178ea56 

## Read Data

Here we will read the data and place them to Pandas dataframes

In [64]:
%ls data/email | head

[0m[01;34mnonspam-test[0m/
[01;34mnonspam-train[0m/
[01;34mspam-test[0m/
[01;34mspam-train[0m/


In [68]:
base_dir = "./data/email"
train_spam, train_legit = "./data/email/spam-train/*", "./data/email/nonspam-train/*"
test_spam,test_legit = "./data/email/spam-test/*", "./data/email/nonspam-test/*"
file_paths = glob.glob('./data/email/spam-train/*')
file_paths[:5]

['./data/email/spam-train/spmsga95.txt',
 './data/email/spam-train/spmsgc4.txt',
 './data/email/spam-train/spmsgc108.txt',
 './data/email/spam-train/spmsga32.txt',
 './data/email/spam-train/spmsgc50.txt']

In [75]:
def read_emails(dir_):
     return [' '.join(open(file_path, "r").readlines()).strip() for file_path in glob.glob(dir_)]

def read_df(spam_dir, legit_dir):
    # spam
    df_spam = pd.DataFrame({'email':read_emails(spam_dir)})
    df_spam['spam'] = 1
    # legit
    df_legit = pd.DataFrame({'email':read_emails(legit_dir)})
    df_legit['spam'] = 0
    return pd.concat([df_spam,df_legit])

df_train = read_df(train_spam, train_legit)
df_test = read_df(test_spam, test_legit)
df_train.shape, df_test.shape

((700, 2), (260, 2))

In [77]:
#df_train.to_csv('./data/email/train.csv')
#df_test.to_csv('./data/email/test.csv')

## TfidfVectorizer

Apply TfidfVectorizer to convert email texts into word vectors. There are multiple options to improve the model (TODO).

In [81]:
tf = TfidfVectorizer()
X_train = tf.fit_transform(df_train['email'])
y_train = df_train['spam']
X_test = tf.transform(df_test['email'])
y_test = df_test['spam']

In [84]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((700, 19073), (700,), (260, 19073), (260,))

## Principal Component Analysis

We will first apply principal component analysis to see how many components are needed:

In [91]:
X_train.todense() # we have to convert the data to dense matrix

matrix([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [93]:
p = PCA()

p.fit(X_train.todense())

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

How many components are needed to explain 95% of the variance?

In [100]:
n_comps = np.argmax(np.cumsum(p.explained_variance_) > 0.95)
n_comps

598

In [102]:
p = PCA(n_components = n_comps)
X_train_pca = p.fit_transform(X_train.todense())
X_test_pca = p.transform(X_test.todense())

## Modeling
### Logistic Regression

In [128]:
lr = LogisticRegression(random_state = 1)
lr.fit(X_train_pca, y_train)
y_pred = lr.predict(X_test_pca)



In [129]:
print(classification_report(y_test,y_pred, target_names = ['legit','spam']))

              precision    recall  f1-score   support

       legit       0.97      0.99      0.98       130
        spam       0.99      0.97      0.98       130

   micro avg       0.98      0.98      0.98       260
   macro avg       0.98      0.98      0.98       260
weighted avg       0.98      0.98      0.98       260



### Artificial Neural Network

In [None]:
nn = MLPClassifier(hidden_layer_sizes = [100,100,10]
                   ,learning_rate_init = 0.0002
                   ,max_iter = 1000
                   ,verbose = 1, random_state = 2)
nn.fit(X_train_pca, y_train)
y_pred = nn.predict(X_test_pca)

In [142]:
print(classification_report(y_test,y_pred, target_names = ['legit','spam']))

              precision    recall  f1-score   support

       legit       0.99      0.99      0.99       130
        spam       0.99      0.99      0.99       130

   micro avg       0.99      0.99      0.99       260
   macro avg       0.99      0.99      0.99       260
weighted avg       0.99      0.99      0.99       260



## Experiment with the model

Let us put all the steps into a pipeline:
* TfidFVectorizer to convert the text to word counts;
* Transform the sparse matrix to dense;
* Apply principal component analysis;
* Run artificial neural network to predict if the email is spam;

In [154]:
model = Pipeline([('tfidfvectorizer',tf)
                 ,('todense',FunctionTransformer(lambda x: x.todense(), accept_sparse=True, validate = False))
                 ,('pca',p)
                 ,('ann',nn)])

The method returns two numbers which can be thought of 'probabilities' that the email is legit (1st) vs. spam (2nd)

In [158]:
model.predict_proba(['Hello'])

array([[ 0.02945142,  0.97054858]])

Type a sample email below and see the spam confidence changes:

In [164]:
interact(lambda x:print("Spam: %.5f"%model.predict_proba([x])[0][1])
         , x='Hi there!')

interactive(children=(Text(value='Hi there!', description='x'), Output()), _dom_classes=('widget-interact',))

<function __main__.<lambda>>