# Natural Language Processing

In [None]:
# Reference:
# https://github.com/ageron/handson-ml/blob/master/03_classification.ipynb
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import urllib
import tarfile
%matplotlib inline

**Download spam emails and ham emails**

In [None]:
def fetch_spam_data():
    """
    Download and unpack the spam and ham email datasets.

    Both archives are saved to 'Data/spam_emails/' (the directory and any
    missing parent are created first), extracted there, and the path of
    every file found below that directory is printed. An archive that is
    already present locally is not downloaded again.
    """
    # `import urllib` alone does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    # Set up URLs and filenames.
    DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
    HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
    SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
    LOCAL_DIRECTORY = "Data/spam_emails/"
    HAM_FILENAME = os.path.join(LOCAL_DIRECTORY, "ham.tar.bz2")
    SPAM_FILENAME = os.path.join(LOCAL_DIRECTORY, "spam.tar.bz2")
    archives = ((HAM_URL, HAM_FILENAME), (SPAM_URL, SPAM_FILENAME))

    # Download spam file and ham file.
    # makedirs (unlike mkdir) also creates the missing parent 'Data/' folder.
    os.makedirs(LOCAL_DIRECTORY, exist_ok=True)
    for url, archive in archives:
        if not os.path.isfile(archive):
            urllib.request.urlretrieve(url, archive)

    # Decompress the downloaded tar files.
    # The archives come from the network: when this Python version supports
    # extraction filters, use the "data" filter so that members cannot be
    # written outside LOCAL_DIRECTORY.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    for _, archive in archives:
        with tarfile.open(archive, 'r') as file:
            file.extractall(LOCAL_DIRECTORY, **extract_kwargs)

    # Print out all filenames.
    for dirname, _, filenames in os.walk(LOCAL_DIRECTORY):
        for filename in filenames:
            print(os.path.join(dirname, filename))

In [None]:
# Download (only if missing) and extract both archives; prints every file path found.
fetch_spam_data()

**Store all the filenames in a list**

In [None]:
HAM_DIR = "Data/spam_emails/easy_ham/"
SPAM_DIR = "Data/spam_emails/spam/"
# os.listdir returns entries in arbitrary, platform-dependent order. Sort the
# names so the dataset order (and therefore the seeded train/test split made
# later) is identical on every run. The "cmds" entry is skipped as before.
ham_filenames = sorted(name for name in os.listdir(HAM_DIR) if name != "cmds")
spam_filenames = sorted(name for name in os.listdir(SPAM_DIR) if name != "cmds")

In [None]:
# Sanity check: number of ham and spam files listed on disk.
print(len(ham_filenames))
print(len(spam_filenames))

In [None]:
# Preview only the first few names instead of dumping the whole list.
ham_filenames[:10]

**Load emails**

In [None]:
import email
import email.policy
# Bind the parser class and the policy module to their own names: the function
# then no longer depends on `email.parser` having been loaded as a side effect
# of another import, nor on the global name `email` still being the module.
from email import policy as email_policy
from email.parser import BytesParser

def load_email(is_spam, filename, data_path="Data/spam_emails/"):
    """
    Parse one email file from the extracted dataset.

    Parameters
    ----------
    is_spam : bool
        Selects the sub-directory: "spam" when true, "easy_ham" otherwise.
    filename : str
        Name of the file inside that sub-directory.
    data_path : str
        Directory that contains the "spam" and "easy_ham" sub-directories.

    Returns
    -------
    The message object produced by ``BytesParser`` with the default policy.
    """
    directory = "spam" if is_spam else "easy_ham"
    # Open in binary mode: the parser handles the decoding itself.
    with open(os.path.join(data_path, directory, filename), 'rb') as file:
        return BytesParser(policy=email_policy.default).parse(file)

In [None]:
# Parse every listed file of both corpora into a message object.
ham_emails = [load_email(False, fname) for fname in ham_filenames]
spam_emails = [load_email(True, fname) for fname in spam_filenames]

In [None]:
email = ham_emails[3]

In [None]:
# ?email

In [None]:
print(email.get_content())

Some emails are actually multipart, with images and attachments (which can have their own attachments). Let's look at the various types of structures we have:

In [None]:
def get_email_structure(email):
    """
    Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list
    of parts yields "multipart(<part>, <part>, ...)", built recursively;
    any other message yields its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        # Single-part message: the content type is the whole description.
        return email.get_content_type()
    described_parts = ", ".join(get_email_structure(part) for part in parts)
    return "multipart({})".format(described_parts)

In [None]:
from collections import Counter

def structures_counter(emails):
    """
    Count how many of the given emails share each structure string.

    Returns a Counter whose keys are the strings produced by
    get_email_structure and whose values are their number of occurrences.
    """
    # A Counter built from an iterable tallies each element it yields.
    return Counter(get_email_structure(message) for message in emails)

In [None]:
# Tally the structure strings of all ham emails.
ham_counter = structures_counter(ham_emails)

In [None]:
# Ham structures, ordered from most to least frequent.
ham_counter.most_common()

In [None]:
# Same tally for the spam emails, most frequent structure first.
structures_counter(spam_emails).most_common()

It seems that the ham emails are more often plain text, while spam has quite a lot of HTML. Moreover, quite a few ham emails are signed using PGP, while no spam is. In short, it seems that the email structure is useful information to have.

**Load email headers**

In [None]:
# Print every header name/value pair of one spam email.
for header, value in spam_emails[4].items():
    print(header,":",value)

In [None]:
# Headers are looked up by name with mapping syntax; show the Subject of the first spam email.
spam_emails[0]["Subject"]

Before we learn too much about the data, let's not forget to split it into a training set and a test set. The test set is reserved to evaluate the classifier that we build from the training set. We are not supposed to use any information from the test set to build the classifier.

In [None]:
from sklearn.model_selection import train_test_split
# Message objects define __len__/__getitem__, so without an explicit object
# dtype NumPy may try to unpack them as nested sequences. dtype=object keeps
# each parsed message as a single opaque element of a 1-D array.
X = np.array(ham_emails + spam_emails, dtype=object)
# Labels follow the same order as X: 0 for ham, 1 for spam.
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

# Hold out 20% of the emails as a test set; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Preprocess the text data**

**1. convert HTML to plain text**

In [None]:
import re
from html import unescape

def html_to_plain_text(html):
    """
    Convert an HTML string to rough plain text.

    Steps, in order: drop the whole <head> section, replace every opening
    <a ...> tag with the word HYPERLINK, strip all remaining tags, collapse
    runs of blank lines into a single newline, and finally decode HTML
    entities (e.g. "&amp;" -> "&").
    """
    # All patterns are raw strings so that regex escapes such as \s reach the
    # regex engine untouched instead of being treated as (invalid) string escapes.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

# Flags:
# re.M: multiline
# re.S: dot matches all
# re.I: ignore case

In [None]:
# Keep only the training-set spam (label 1) whose structure is exactly "text/html".
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
# Show the first 1000 characters of the raw HTML of that sample.
print(sample_html_spam.get_content().strip()[:1000], "...")

This is the resulting plain text:

In [None]:
# Same sample after conversion to plain text; first 1000 characters only.
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")

Now let's write a function that takes an email as input and returns its content as plain text, whatever its format is:

In [None]:
def email_to_text(email):
    """
    Return the content of an email as plain text.

    The message parts are walked in order. The first "text/plain" part is
    returned as is. If there is none, the last "text/html" part seen is
    converted with html_to_plain_text. Parts of any other type are ignored.

    Returns
    -------
    str or None
        The text, or None when the email has no usable text or HTML part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Decoding failed (e.g. encoding issues): fall back to the raw
            # payload. Only Exception is caught so that KeyboardInterrupt
            # and SystemExit still propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [None]:
# First 100 characters of the plain-text rendering of the sample HTML spam.
print(email_to_text(sample_html_spam)[:100], "...")

**2. Combine similar words together using the Natural Language Toolkit (NLTK)**

In [None]:
# install nltk
!pip install nltk

In [None]:
import nltk

# Module-level stemmer; EmailToWordCounterTransformer.transform reads this global.
stemmer = nltk.PorterStemmer()
# The following words have the same root (except the last one):
for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
    print(word, "=>", stemmer.stem(word))

**3. Extract URLs from emails**

In [None]:
!pip install urlextract

In [None]:
import urlextract

# Module-level extractor; EmailToWordCounterTransformer.transform reads this global.
url_extractor = urlextract.URLExtract()
# Quick check on a sentence containing a bare domain and a full URL.
print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))

**Text Analysis: examine word frequencies in spam emails and ham emails**

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """
    Turn each email into a Counter mapping words to their number of occurrences.

    Parameters
    ----------
    strip_headers : bool
        Stored on the instance; not read by transform.
    lower_case : bool
        Lower-case the text first.
    remove_punctuation : bool
        Replace every run of non-word characters with a single space.
    replace_urls : bool
        Replace each URL found by the global `url_extractor` with " URL ".
    replace_numbers : bool
        Replace each number with the token NUMBER.
    stemming : bool
        Merge the counts of words that share a stem, using the global `stemmer`.

    Notes
    -----
    transform relies on the module-level names `email_to_text`,
    `url_extractor` and `stemmer`. URL replacement and stemming are skipped
    when the corresponding global is None.
    """
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        """Nothing to learn; return self so the transformer can be chained."""
        return self
    def transform(self, X, y=None):
        """Return an object array holding one word Counter per email in X."""
        X_transformed = []
        for email in X:
            # email_to_text returns None when no text part exists.
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                # Longest first, so that a URL which is a prefix of another
                # one cannot break the longer URL before it is replaced.
                urls.sort(key=len, reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                # Integer part, optional fraction, optional signed exponent.
                # The fraction and the exponent are independent optional
                # groups, so "3.14" and "1e5" each become a single NUMBER.
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        # Explicit object dtype: one Counter per element, including for empty input.
        return np.array(X_transformed, dtype=object)

In [None]:
# Try the transformer on the first three training emails.
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

**Convert word counts to vectors**

In [None]:
from scipy.sparse import csr_matrix # sparse matrix type

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """
    Convert word-count mappings into a sparse count matrix.

    fit selects the `vocabulary_size` most common words, counting each word
    at most 10 times per email, and assigns them column indices starting at 1.
    transform writes each word's count into its column; words outside the
    vocabulary all go to column 0.
    """
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn `most_common_` and `vocabulary_` from the word counts in X."""
        capped_totals = Counter()
        for counts in X:
            for token, occurrences in counts.items():
                # Cap the contribution of a single email at 10.
                capped_totals[token] += min(occurrences, 10)
        top_words = capped_totals.most_common()[:self.vocabulary_size]
        self.most_common_ = top_words
        self.vocabulary_ = {
            token: rank for rank, (token, _) in enumerate(top_words, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape (len(X), vocabulary_size + 1)."""
        entries = [
            (row_index, self.vocabulary_.get(token, 0), occurrences)
            for row_index, counts in enumerate(X)
            for token, occurrences in counts.items()
        ]
        if entries:
            row_indices, col_indices, values = map(list, zip(*entries))
        else:
            row_indices, col_indices, values = [], [], []
        n_columns = self.vocabulary_size + 1
        return csr_matrix((values, (row_indices, col_indices)),
                          shape=(len(X), n_columns))

In [None]:
# With vocabulary_size=10 the vectors have 11 columns: column 0 collects
# every word outside the vocabulary.
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

In [None]:
# Densify to inspect the counts (cheap here: 3 rows x 11 columns).
X_few_vectors.toarray()

In [None]:
# Word -> column index mapping learned by fit (indices start at 1).
vocab_transformer.vocabulary_

Now let's transform the whole dataset.

In [None]:
from sklearn.pipeline import Pipeline

# Chain the two transformers: emails -> word counts -> sparse count vectors.
preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

# Fit on the training set only, so the vocabulary is learned without the test set.
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

Apply a classification model called *logistic regression* to the training set

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="liblinear", random_state=42)
# 3-fold cross-validation on the training set; the cell shows the mean of the fold scores.
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

Print out the precision and recall of the logistic model on the test set:

In [None]:
from sklearn.metrics import precision_score, recall_score

# Only transform the test set: the pipeline keeps what it learned from the training set.
X_test_transformed = preprocess_pipeline.transform(X_test)

# Train a fresh classifier on the full training set (fit returns the estimator itself).
log_clf = LogisticRegression(solver="liblinear", random_state=42).fit(
    X_train_transformed, y_train
)
y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))