In [7]:
import importlib
import sys

# Modules this notebook needs; each one is imported and bound as a global.
dependencies = ["nltk", "numpy", "pandas", "scipy", "sklearn", "pickle", "re"]

# Import names whose pip distribution is published under a different name.
# `pip install sklearn` does not install scikit-learn, so the hint printed
# below has to use the real distribution name.
pip_names = {"sklearn": "scikit-learn"}

for module in dependencies:
    print("\nChecking for " + module + "...")
    try:
        # Import the module from its name held in a string variable.
        module_obj = importlib.import_module(module)
    except ImportError:
        print("Install " + module + " before continuing")
        print("In a terminal type the following commands:")
        print("python get-pip.py")
        print("pip install " + pip_names.get(module, module) + "\n")
        sys.exit(1)
    else:
        # Expose the module under its own name in the global namespace.
        globals()[module] = module_obj

print("\nSystem is ready!")

import urllib.request
import os
import tarfile
import pickle
import pandas as pd

import os.path

# Make sure both output files exist. A missing file is created empty and the
# handle is closed right away (the original left the handles open).
for pickle_name, label in (('trained_model.pickle', 'Model'),
                           ('vectorizer.pickle', 'Vectorizer')):
    if os.path.isfile(pickle_name):
        print(label + " File exist")
    else:
        with open(pickle_name, 'w+'):
            pass

print("Downloading Enron emails in the Downloads folder...")

# Get the user's Downloads folder path. The home directory and the folder
# name are passed as separate components so os.path.join inserts the
# platform's own separator instead of a hard-coded "/".
downloads = os.path.join(os.path.expanduser('~'), "Downloads")

# Base URL the archive names in `enron_files` are appended to.
url = "http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/"

# Local directory that receives the downloaded archives.
enron_dir = os.path.join(downloads, 'Enron emails')

# Archive file names, fetched from `url` and stored in `enron_dir`.
enron_files = ['enron1.tar.gz', 'enron2.tar.gz', 'enron3.tar.gz',
               'enron4.tar.gz', 'enron5.tar.gz', 'enron6.tar.gz']

def download():
    """ Download Enron emails if missing.

    Creates `enron_dir` when needed and fetches every archive listed in
    `enron_files` that is not already present there. Each archive is first
    written to a temporary ``.part`` file and renamed only after the transfer
    finished, so an interrupted download is never mistaken for a complete
    archive on the next run.
    """

    # Create the directories (no error if they already exist).
    os.makedirs(enron_dir, exist_ok=True)

    # Download the files that do not exist yet.
    for file in enron_files:
        path = os.path.join(enron_dir, file)
        if os.path.exists(path):
            continue

        partial_path = path + ".part"
        try:
            urllib.request.urlretrieve(url + file, partial_path)
        except BaseException:
            # Drop the truncated file, then let the error propagate.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        os.replace(partial_path, path)

def extract_emails(fname):
    """ Extract the zipped emails and load them into a pandas df.

    A member is recorded once for every label ('ham', 'spam') that occurs
    anywhere in its archive path; members that cannot be read as a file
    (e.g. directories) are skipped.

    Args:
        fname (str): path of the tar.gz archive to read
    Returns:
        pandas df: one row per email with the raw bytes in 'message'
        and the label in 'class'
    """

    rows = []
    # The context managers close the archive and each member reader even
    # when reading fails part-way through.
    with tarfile.open(fname, 'r:gz') as tfile:
        for member in tfile.getmembers():
            for label in ('ham', 'spam'):
                if label not in member.name:
                    continue
                f = tfile.extractfile(member)
                if f is None:
                    continue
                with f:
                    rows.append({'message': f.read(), 'class': label})
    return pd.DataFrame(rows)

def populate_df_and_pickle():
    """ Populate the df with all the emails and save it to a pickle object.

    Does nothing when ``emails.pickle`` already exists in `downloads`.
    Otherwise every archive in `enron_files` is read with `extract_emails`
    and the combined DataFrame is pickled. Row indexes of the individual
    archives are kept as they are (not renumbered).
    """

    pickle_path = downloads + "/emails.pickle"
    if os.path.exists(pickle_path):
        return

    # Collect the per-archive frames and concatenate once: DataFrame.append
    # was removed in pandas 2.0 and re-copied all rows on every iteration.
    frames = [extract_emails(os.path.join(enron_dir, file))
              for file in enron_files]
    if frames:
        emails_df = pd.concat(frames)
    else:
        # pd.concat rejects an empty list; fall back to the empty layout.
        emails_df = pd.DataFrame({'message': [], 'class': []})
    emails_df.to_pickle(pickle_path)

# Entry point: fetch the archives first, then build the pickled DataFrame.
if __name__ == '__main__':
    for step in (download, populate_df_and_pickle):
        step()
    print("Download, unzip, and save to pickle done!")


# Show which folder holds the cached dataset.
print(downloads)

# Read back the DataFrame stored in the Downloads folder.
with open(downloads + '/emails.pickle', 'rb') as pickled_emails:
    emails_df = pickle.load(pickled_emails)

# The messages were stored as bytes objects; turn each one into a string.
emails_df['message'] = [raw.decode('latin-1') for raw in emails_df['message']]

# Renumber the rows 0..n-1, discarding the old index.
emails_df = emails_df.reset_index(drop=True)

# Encode the labels numerically: 'spam' becomes 1 and 'ham' becomes 0.
label_codes = {'spam': 1, 'ham': 0}
emails_df['class'] = emails_df['class'].map(label_codes)

print(emails_df.index)
emails_df.shape

emails_df.iloc[25000].values

from string import punctuation
import re

def clean_email(email):
    """ Strip urls, numbers, newlines and punctuation; convert to lower case.

    Urls (anything starting with 'http' up to the next whitespace), runs of
    digits and newline characters are each replaced by a single space.
    Punctuation characters are deleted without a replacement.

    Args:
        email (unicode): the email
    Returns:
        email (unicode): only the text of the email
    """

    email = re.sub(r'http\S+', ' ', email)
    # Raw string: "\d" in a normal string literal is an invalid escape.
    email = re.sub(r'\d+', ' ', email)
    email = email.replace('\n', ' ')
    email = email.translate(str.maketrans("", "", punctuation))
    return email.lower()

# Run every message through clean_email and store the result in place.
emails_df['message'] = emails_df['message'].map(clean_email)

emails_df.iloc[25000].values

from nltk.stem.snowball import SnowballStemmer
# nltk.download('wordnet') # uncomment to download 'wordnet'
from nltk.corpus import wordnet as wn

def preproces_text(email):
    """ Stem every whitespace-separated word of the email.

    The stemmed words are written out in their original order, each one
    followed by a single space (so a non-empty result ends with a space).
    Args:
        email (unicode): the email
    Returns:
        words (unicode): the text of the email
    """

    # Build the English Snowball stemmer used for every token.
    stemmer = SnowballStemmer("english")
    # Optional: remove unknown words by keeping only tokens for which
    # wn.synsets(token) is non-empty.
    return "".join(stemmer.stem(token) + " " for token in email.split())

# Replace each cleaned message with its stemmed version.
emails_df['message'] = emails_df['message'].map(preproces_text)

emails_df.iloc[25000].values

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Define the independent variables as Xs.
Xs = emails_df['message'].values

# Define the target (dependent) variable as Ys.
Ys = emails_df['class'].values

# Vectorize words - turn the text into numerical feature vectors,
# using the strategy of tokenization, counting and normalization.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                       stop_words='english')

# NOTE(review): the vectorizer is fitted on the whole corpus before the
# split below, so the test rows also contribute to the vocabulary and IDF
# weights. Left unchanged here so the results stay comparable.
Xs = vectorizer.fit_transform(Xs)

# Create a train/test split using 20% test size.
X_train, X_test, y_train, y_test = train_test_split(Xs,
                                                    Ys,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=0,
                                                    stratify=Ys)

# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print("Number of different words: {0}".format(len(feature_names)))
print("Word example: {0}".format(feature_names[5369]))

# Check the split printing the shape of each set.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

from sklearn.naive_bayes import MultinomialNB

# Build a multinomial Naive Bayes classifier and fit it on the
# training features and labels.
clf = MultinomialNB().fit(X_train, y_train)

import pickle

# Write the fitted classifier and the vectorizer to their pickle files.
for target_name, artifact in (("trained_model.pickle", clf),
                              ("vectorizer.pickle", vectorizer)):
    with open(target_name, "wb") as sink:
        pickle.dump(artifact, sink)

# Report the accuracy measured on the held-out test data.
test_accuracy = clf.score(X_test, y_test)
print("Accuracy: {}".format(test_accuracy))



Checking for nltk...

Checking for numpy...

Checking for pandas...

Checking for scipy...

Checking for sklearn...

Checking for pickle...

Checking for re...

System is ready!
Model File exist
Vectorizer File exist
Downloading Enron emails in the Downloads folder...
Download, unzip, and save to pickle done!
C:\Users\User/Downloads
RangeIndex(start=0, stop=33716, step=1)
Number of different words: 119405
Word example: arcadian
(26972, 119405) (26972,)
(6744, 119405) (6744,)
Accuracy: 0.9847271648873073


