### http://www2.aueb.gr/users/ion/data/enron-spam/

In [1]:
import pandas as pd
import numpy as np
import os

NEWLINE = '\n'

def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            if os.path.isfile(file_path):
                lines = []
                f = open(file_path, encoding="latin-1")
                for line in f:
                    lines.append(line)
                f.close()
                content = NEWLINE.join(lines)
                yield file_path, content

def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    df = pd.DataFrame(rows, index=index)
    return df

In [2]:
SOURCES = [
    ('data\mail\spam',   'SPAM'),
    ('data\mail\ham',    'HAM')
]

data = pd.DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))
data = data.reindex(np.random.permutation(data.index))
data.head(1)

Unnamed: 0,class,text
data\mail\ham\2561.2000-10-17.farmer.ham.txt,HAM,Subject: re : cornhusker\n\ntenaska iv has bee...


### feature extraction - word vectorizing

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(data['text'].values)
counts

<5174x50474 sparse matrix of type '<class 'numpy.int64'>'
	with 456439 stored elements in Compressed Sparse Row format>

In [4]:
counts.data

array([1, 1, 1, ..., 1, 1, 3], dtype=int64)

### classification

In [5]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

examples = ['please call linda and get everything set up ', 'when we are young , but near the age of twenty - one our bodies begin to produce']
example_counts = count_vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['HAM', 'SPAM'], 
      dtype='<U4')

### pipeline

In [6]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer',  CountVectorizer()),
    ('classifier',  MultinomialNB()) ])

pipeline.fit(data['text'].values, data['class'].values)
pipeline.predict(examples)

array(['HAM', 'SPAM'], 
      dtype='<U4')