### Explore how to build a spam detector

In [1]:
from __future__ import division, print_function, unicode_literals
import numpy as np
import pandas as pd
import os

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12


## Load data

In [2]:
from sys import platform

# Choose the dataset root for the machine the notebook runs on.
# NOTE(review): the non-Linux branch points at a Titanic directory — confirm it
# really holds the spam/ham corpus.
if platform != 'linux':
    DATA_PATH = os.path.join('C:/Users/alin/Documents/Kaggle/Titanic', 'Data')
else:
    DATA_PATH = os.path.join("/home/alin/handson-ml/datasets", "spamham")

In [3]:
# Interactive debugging aid: opens a Qt console attached to this kernel.
# Not required for the analysis itself.
%qtconsole

In [4]:
# One sub-directory of DATA_PATH per corpus: easy ham, hard ham and spam.
EASY_HAM_PATH, HARD_HAM_PATH, SPAM_PATH = (
    os.path.join(DATA_PATH, subdir)
    for subdir in ('easy_ham', 'hard_ham', 'spam')
)

In [5]:
def load_one_email(filepath):
    """Read one e-mail file and return its full contents as a string.

    The file is decoded as latin-1, which maps every byte to a character,
    so decoding never fails regardless of the message's real charset.
    """
    with open(filepath, mode='r', encoding='latin-1') as email_file:
        raw_text = email_file.read()
    return raw_text

In [6]:
# Column-oriented accumulator: one list per future DataFrame column.
spam_data = dict(ID=[], Email=[], Label=[])

In [7]:
# Load every file in the easy-ham directory with label 0.
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order (and hence the seeded shuffle and train/test split) vary by machine.
for filename in sorted(os.listdir(EASY_HAM_PATH)):
    spam_data['ID'].append(filename)
    spam_data['Label'].append(0)
    spam_data['Email'].append(load_one_email(os.path.join(EASY_HAM_PATH, filename)))
   

In [8]:
# Load every file in the hard-ham directory with label 0.
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order (and hence the seeded shuffle and train/test split) vary by machine.
for filename in sorted(os.listdir(HARD_HAM_PATH)):
    spam_data['ID'].append(filename)
    spam_data['Label'].append(0)
    spam_data['Email'].append(load_one_email(os.path.join(HARD_HAM_PATH, filename)))

In [9]:
# Load every file in the spam directory with label 1.
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order (and hence the seeded shuffle and train/test split) vary by machine.
for filename in sorted(os.listdir(SPAM_PATH)):
    spam_data['ID'].append(filename)
    spam_data['Label'].append(1)
    spam_data['Email'].append(load_one_email(os.path.join(SPAM_PATH, filename)))

In [10]:
# Turn the accumulated column lists into a single frame (columns: ID, Email, Label).
spam_df = pd.DataFrame(data=spam_data)

In [11]:
# Features are the raw e-mail texts; the target is the 0/1 label column.
X, y = spam_df['Email'], spam_df['Label']

In [12]:
# Draw one random ordering and apply it to both series so that each text
# stays paired with its label.
shuffle_index = np.random.permutation(len(y))
X_shuffled = X[shuffle_index]
y_shuffled = y[shuffle_index]
X, y = X_shuffled, y_shuffled
del X_shuffled, y_shuffled

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the e-mails for testing; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
from sklearn.pipeline import Pipeline

In [16]:
import email

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin

In [103]:
def remove_header(x):
    """Return the body of a raw e-mail string with its headers removed.

    The text is parsed with the standard ``email`` package. When the parsed
    payload is not a plain string (for example a multipart message, whose
    payload is a list of sub-messages), an empty string is returned instead.
    """
    message = email.message_from_string(x)
    body = message.get_payload()
    if not isinstance(body, str):
        # non-string payloads are deliberately collapsed to ''
        return ''
    return body

class HeaderRemover(BaseEstimator, TransformerMixin):
    """Transformer that strips e-mail headers via ``remove_header``.

    Parameters
    ----------
    header_remove : bool, default True
        When False, ``transform`` returns its input untouched.
    """

    def __init__(self, header_remove=True):
        self.header_remove = header_remove

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Apply ``remove_header`` to every element of X (X must offer ``.apply``)."""
        if not self.header_remove:
            return X
        return X.apply(remove_header)

In [155]:
class UrlCleaner(BaseEstimator, TransformerMixin):
    """Transformer that replaces every http/https URL with the token ``URL``.

    Parameters
    ----------
    url_clean : bool, default True
        When False, ``transform`` returns its input untouched.
    """

    def __init__(self, url_clean=True):
        self.url_clean = url_clean

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with URLs replaced (X must expose the pandas ``.str`` accessor)."""
        if self.url_clean:
            # Raw string: '\S' in a normal literal is an invalid escape sequence.
            # regex=True is explicit because pandas >= 2.0 treats the pattern as a
            # literal string by default, which would leave every URL in place.
            return X.str.replace(r'https?://\S*', 'URL', regex=True)
        else:
            return X

In [None]:
class NumCleaner(BaseEstimator, TransformerMixin):
    """Transformer that replaces every number with the token ``NUMBER``.

    A number is a run of digits, optionally followed by a decimal point and
    more digits (``42``, ``3.14``).

    Parameters
    ----------
    num_clean : bool, default True
        When False, ``transform`` returns its input untouched.
    """

    def __init__(self, num_clean=True):
        self.num_clean = num_clean

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return X with numbers replaced (X must expose the pandas ``.str`` accessor)."""
        if self.num_clean:
            # The previous pattern '\d*\.?' also matched the empty string, so the
            # token was inserted between every character, and the token itself was
            # 'URL' (copied from UrlCleaner). Require at least one digit and use a
            # dedicated token. regex=True is explicit because pandas >= 2.0 treats
            # the pattern as a literal string by default.
            return X.str.replace(r'\d+(?:\.\d+)?', 'NUMBER', regex=True)
        else:
            return X

In [157]:
def process_pipeline(header_remove=True, url_clean=True, lowercase=True, strip_accents = True,
                     binary=True):
    """Build the text pre-processing pipeline applied to raw e-mails.

    Parameters
    ----------
    header_remove : bool, default True
        Forwarded to ``HeaderRemover``.
    url_clean : bool, default True
        Forwarded to ``UrlCleaner``.
    lowercase, strip_accents, binary : bool, default True
        Meant for the ``CountVectorizer`` step. That step is currently
        disabled, so these arguments have no effect on the returned pipeline.

    Returns
    -------
    Pipeline
        Two steps: ``'header_remover'`` then ``'url_clean'``.
    """
    # Only consumed by the disabled 'vect' step below; kept so that step can be
    # re-enabled without further changes.
    accents = 'unicode' if strip_accents else None
    spam_pipeline = Pipeline([
        ('header_remover', HeaderRemover(header_remove=header_remove)),
        ('url_clean', UrlCleaner(url_clean=url_clean)),
        # Disabled vectorizer step:
        # ('vect', CountVectorizer(lowercase=lowercase, strip_accents=accents, binary=binary)),
    ])
    return spam_pipeline

In [158]:
# Build the pre-processing pipeline with all default options.
pline = process_pipeline()

In [159]:
# Fit the pipeline on the training texts and transform them in one call.
X_train1 = pline.fit_transform(X_train)

In [160]:
# Transform only (no fitting) on the held-out texts.
X_test1 = pline.transform(X_test)

In [22]:
# Bag-of-words features; the vocabulary is fitted on the training texts only
# and then reused for the test texts.
# NOTE(review): this vectorizes the raw X_train / X_test, not the pipeline
# outputs X_train1 / X_test1 — confirm that is intended.
vect = CountVectorizer(binary=False)
X_train2 = vect.fit_transform(X_train)
X_test2 = vect.transform(X_test)

In [63]:
from sklearn.pipeline import Pipeline

In [67]:
import email

In [69]:
# Parse one raw e-mail string into an email message object.
# NOTE(review): `a` is not defined in any visible cell — presumably a raw e-mail
# string left over from an earlier or deleted cell; define it before this line.
b = email.message_from_string(a)

In [70]:
# Print the parsed message.
print(b)

Return-Path: <spamassassin-talk-admin@example.sourceforge.net>
Delivered-To: yyyy@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 67C3247C68
	for <jm@localhost>; Thu, 22 Aug 2002 10:46:29 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Thu, 22 Aug 2002 15:46:29 +0100 (IST)
Received: from usw-sf-list2.sourceforge.net (usw-sf-fw2.sourceforge.net
    [216.136.171.252]) by dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id
    g7MEknZ10703 for <jm-sa@jmason.org>; Thu, 22 Aug 2002 15:46:49 +0100
Received: from usw-sf-list1-b.sourceforge.net ([10.3.1.13]
    helo=usw-sf-list1.sourceforge.net) by usw-sf-list2.sourceforge.net with
    esmtp (Exim 3.31-VA-mm2 #1 (Debian)) id 17htDp-0003Z5-00; Thu,
    22 Aug 2002 07:46:05 -0700
Received: from neo.pittstate.edu ([198.248.208.13]) by
    usw-sf-list1.sourceforge.net with esmtp (Exim 3.31-VA-mm2 #