### Explore how to build a spam detector

In [4]:
from __future__ import division, print_function, unicode_literals
import numpy as np
import pandas as pd
import os

# Seed NumPy's global random generator so the np.random calls made later in this
# notebook (e.g. the row permutation) are repeatable from a fresh kernel.
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes for axis titles and tick labels, applied to every figure in the notebook.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})


## Load data

In [5]:
import sys

from sys import platform
# Resolve the root folder of the spam/ham corpus.
#
# 1. If the SPAMHAM_DATA_PATH environment variable is set, it wins. This lets the
#    notebook run on machines other than the two hard-coded below without editing code.
# 2. Otherwise fall back to the original per-OS locations.
#
# startswith() is used instead of `== 'linux'` because sys.platform is 'linux2' on
# Python 2 and 'linux' on Python 3.3+; an exact comparison would send the former
# down the Windows branch.
_data_path_override = os.environ.get('SPAMHAM_DATA_PATH')
if _data_path_override:
    DATA_PATH = _data_path_override
elif platform.startswith('linux'):
    DATA_PATH = os.path.join("/home/alin/handson-ml/datasets", "spamham")
else:
    DATA_PATH = os.path.join('C:/Users/alin/Documents/SelfStudy/datasets', 'spamham')

In [6]:
# NOTE(review): IPython magic that opens a Qt console attached to this kernel.
# It looks like a leftover interactive debugging aid; nothing else in the visible
# cells depends on it. Consider removing it before sharing, since it runs on every
# "Run All".
%qtconsole

In [7]:
# One sub-folder of DATA_PATH per email category.
EASY_HAM_PATH, HARD_HAM_PATH, SPAM_PATH = (
    os.path.join(DATA_PATH, category_dir)
    for category_dir in ('easy_ham', 'hard_ham', 'spam')
)

In [8]:
def load_one_email(filepath):
    """Read one email file and return its entire content as a string.

    The file is opened in text mode and decoded as latin-1, which maps every
    possible byte to a character, so decoding cannot fail on unexpected bytes.

    Parameters
    ----------
    filepath : str
        Path of the email file to read.

    Returns
    -------
    str
        The full text of the file (headers and body, exactly as stored).
    """
    with open(filepath, mode='r', encoding='latin-1') as email_file:
        raw_text = email_file.read()
    return raw_text

In [89]:
# Column-oriented accumulator: one entry per email file, in three parallel lists.
#   'ID'    - the file name
#   'Email' - the raw file content as returned by load_one_email
#   'Label' - 0 for ham, 1 for spam
spam_data = {'ID': [], 'Email': [], 'Label': []}

# Each source folder paired with the label assigned to every file inside it.
# Both ham folders share label 0; the spam folder gets label 1.
labelled_folders = (
    (EASY_HAM_PATH, 0),
    (HARD_HAM_PATH, 0),
    (SPAM_PATH, 1),
)

for folder, label in labelled_folders:
    # os.listdir returns names in arbitrary, filesystem-dependent order. Sorting
    # makes the row order deterministic, and with it the seeded shuffle and
    # train/test split further down, regardless of machine.
    for filename in sorted(os.listdir(folder)):
        spam_data['ID'].append(filename)
        spam_data['Label'].append(label)
        spam_data['Email'].append(load_one_email(os.path.join(folder, filename)))

In [13]:
# Turn the collected columns into a DataFrame, then pull out the raw email text
# as the feature series and the 0/1 label as the target series.
spam_df = pd.DataFrame(spam_data)
X, y = spam_df['Email'], spam_df['Label']

In [15]:
# Draw one random ordering of the rows and apply the same ordering to both the
# emails and their labels so the pairs stay aligned.
shuffle_index = np.random.permutation(len(y))
X = X[shuffle_index]
y = y[shuffle_index]

from sklearn.model_selection import train_test_split
# Hold out 33% of the rows as the test set; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import Pipeline

import email

from sklearn.base import BaseEstimator, TransformerMixin

In [21]:
def remove_header(x):
    """Strip the headers from a raw email and return only its body text.

    The raw text is parsed with ``email.message_from_string``.

    * Single-part message: the payload string is returned unchanged.
    * Multipart message: ``get_payload()`` returns a list of sub-messages rather
      than a string. Instead of discarding the whole body, every leaf part
      whose main content type is ``text`` (e.g. text/plain, text/html) is
      collected and the parts are joined with newlines. Non-text parts
      (attachments, images) are skipped. If there is no text part the result
      is ``''``.

    Payloads are returned as stored in the message; no transfer-encoding
    (base64 / quoted-printable) decoding is performed.

    Parameters
    ----------
    x : str
        Full raw email text, headers included.

    Returns
    -------
    str
        The body text with all headers removed.
    """
    message = email.message_from_string(x)
    payload = message.get_payload()
    if isinstance(payload, str):
        return payload

    # Multipart: walk the whole tree so nested multipart containers are handled too.
    text_parts = []
    for part in message.walk():
        if part.is_multipart():
            # Container node; its children are visited by walk() on their own.
            continue
        if part.get_content_maintype() != 'text':
            continue
        part_payload = part.get_payload()
        if isinstance(part_payload, str):
            text_parts.append(part_payload)
    return '\n'.join(text_parts)

class HeaderRemover(BaseEstimator, TransformerMixin):
    """Pipeline step that optionally strips the headers from every email.

    Parameters
    ----------
    header_remove : bool, default True
        When true, ``transform`` applies ``remove_header`` to each element of
        its input; when false, the input is passed through untouched.
    """

    def __init__(self, header_remove=True):
        self.header_remove = header_remove

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with headers removed, or ``X`` itself when disabled.

        ``X`` must provide ``.apply`` (presumably a pandas Series of raw email
        strings - confirm against the caller).
        """
        if not self.header_remove:
            return X
        return X.apply(remove_header)

In [22]:
class UrlCleaner(BaseEstimator, TransformerMixin):
    """Pipeline step that optionally replaces every http/https URL with ``URL``.

    Parameters
    ----------
    url_clean : bool, default True
        When true, ``transform`` substitutes each match of ``https?://\\S*``
        with the literal token ``URL``; when false, the input is returned as is.
    """
    def __init__(self, url_clean=True):
        self.url_clean = url_clean
    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self
    def transform(self, X, y=None):
        """Return ``X`` with URLs replaced, or ``X`` itself when disabled.

        ``X`` must expose the pandas ``.str`` accessor (i.e. a Series of strings).
        """
        if self.url_clean:
            # Raw string: "\S" in a normal string literal is an invalid escape sequence.
            # regex=True is explicit because pandas >= 2.0 defaults Series.str.replace
            # to literal matching, which would silently leave every URL in place.
            return X.str.replace(r'https?://\S*', 'URL', regex=True)
        else:
            return X

In [65]:
class NumCleaner(BaseEstimator, TransformerMixin):
    """Pipeline step that optionally replaces numbers with the token `` NUM ``.

    A "number" is a maximal run of the characters ``- + . ,`` and digits that
    contains at least one digit, so ``1,000.50``, ``-5`` and ``2002-08-22``
    each become a single `` NUM ``.

    Parameters
    ----------
    num_clean : bool, default True
        When true, ``transform`` performs the substitution; when false, the
        input is returned as is.
    """
    def __init__(self, num_clean=True):
        self.num_clean = num_clean
    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self
    def transform(self, X, y=None):
        """Return ``X`` with numbers replaced, or ``X`` itself when disabled.

        ``X`` must expose the pandas ``.str`` accessor (i.e. a Series of strings).
        """
        if self.num_clean:
            # The earlier pattern '[-+\d.,]+' also matched runs made only of
            # punctuation, so every comma, full stop and hyphen became " NUM ".
            # Requiring one \d inside the run keeps ordinary punctuation intact.
            # Raw string avoids invalid escape sequences; regex=True is explicit
            # because pandas >= 2.0 defaults Series.str.replace to literal matching.
            return X.str.replace(r'[-+.,]*\d[-+\d.,]*', ' NUM ', regex=True)
        else:
            return X

In [71]:
from nltk.stem import PorterStemmer
# Single Porter stemmer instance shared by every call to stemmed_words.
stemmer = PorterStemmer()
# Analyzer built from a CountVectorizer with all-default settings; it turns one
# document into its sequence of tokens.
analyzer = CountVectorizer().build_analyzer()


def stemmed_words(doc):
    """Tokenize ``doc`` with the module-level ``analyzer`` and stem each token.

    Returns a generator that yields the Porter stem of every token in order.
    """
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)


In [83]:
def _stem_tokens(doc, base_analyzer):
    """Tokenize ``doc`` with ``base_analyzer`` and return the Porter stem of each token."""
    return [stemmer.stem(token) for token in base_analyzer(doc)]


def process_pipeline(header_remove=True, url_clean=True, num_clean=True, stem=False, stop_words='english', lowercase=True, strip_accents=True,
                     binary=True):
    """Build the email-to-features preprocessing pipeline.

    Steps, in order: ``HeaderRemover`` -> ``UrlCleaner`` -> ``NumCleaner`` ->
    ``CountVectorizer`` (step names ``header_remover``, ``url_clean``,
    ``num_clean``, ``vect``).

    Parameters
    ----------
    header_remove, url_clean, num_clean : bool
        Forwarded to the corresponding cleaning transformer.
    stem : bool
        When true, every token is Porter-stemmed before counting.
    stop_words, lowercase, binary
        Forwarded to ``CountVectorizer``.
    strip_accents : bool
        Truthy selects ``CountVectorizer``'s ``'unicode'`` accent stripping;
        falsy disables accent stripping (``None``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.

    Notes
    -----
    ``CountVectorizer`` ignores ``lowercase``, ``strip_accents`` and
    ``stop_words`` when it is given a callable analyzer. To make ``stem=True``
    honour those options, a standard word analyzer is first built with the
    requested settings and then wrapped with stemming. The wrapper is a
    ``functools.partial`` over a module-level function rather than a closure
    so the resulting pipeline stays picklable.
    """
    from functools import partial  # local import keeps this cell self-contained

    accents = 'unicode' if strip_accents else None
    text_options = dict(lowercase=lowercase, strip_accents=accents, stop_words=stop_words)

    if stem:
        # Tokenize with the requested options, then stem: stop words are
        # therefore removed *before* stemming.
        base_analyzer = CountVectorizer(**text_options).build_analyzer()
        vectorizer = CountVectorizer(
            analyzer=partial(_stem_tokens, base_analyzer=base_analyzer),
            binary=binary,
        )
    else:
        vectorizer = CountVectorizer(analyzer='word', binary=binary, **text_options)

    spam_pipeline = Pipeline([
        ('header_remover', HeaderRemover(header_remove=header_remove)),
        ('url_clean', UrlCleaner(url_clean=url_clean)),
        ('num_clean', NumCleaner(num_clean=num_clean)),
        ('vect', vectorizer),
    ])

    return spam_pipeline

In [84]:
# Pipeline variant with Porter stemming switched on; every other option keeps
# its default.
pline = process_pipeline(stem=True)

In [85]:
# Fit the stemming pipeline on the training emails and vectorize them in one call.
X_train1 = pline.fit_transform(X_train)

In [87]:
# Second pipeline with stemming switched off, fitted on the same training emails
# (presumably to compare against X_train1 - confirm with later cells).
pline1 = process_pipeline(stem=False)
X_train2 = pline1.fit_transform(X_train)