In [21]:
import os
import read_email_data as df

In [22]:
# Dataset locations, given as relative paths.
DATA_DIR = "trec07p/data/"          # directory the 'inmail.<n>' files are read from
LABELS_FILE = "trec07p/full/index"  # index file: one '<label> <path>' pair per line
# Value passed as train_size when the data set is split into train/test parts.
TRAINING_SET_RATIO = 0.7

In [23]:
# Build a mapping from email file name to numeric class.
# Each non-blank index line is expected to hold exactly two whitespace-separated
# fields, '<label> <path>'; the last '/'-separated component of <path> is the key.
# Encoding: 1 for 'ham' (case-insensitive), 0 for any other label.
labels = {}
with open(LABELS_FILE) as f:
    for line in f:
        fields = line.split()
        if not fields:
            # A blank or whitespace-only line would break the two-way unpack
            # below with a ValueError, so skip it.
            continue
        # Lines with a different number of fields still raise ValueError,
        # so a malformed index is not silently accepted.
        label, key = fields
        labels[key.split('/')[-1]] = 1 if label.lower() == 'ham' else 0

In [29]:
def read_email_files():
    """Load every labelled email and its class label.

    File names are generated as 'inmail.1' .. 'inmail.<len(labels)>', each
    one is resolved against DATA_DIR and passed to df.extract_email_text,
    and the matching entry of the module-level ``labels`` dict is looked up.

    Returns:
        A tuple ``(X, y)`` of two parallel lists: ``X`` holds the values
        returned by ``df.extract_email_text`` and ``y`` the corresponding
        values from ``labels``.

    Raises:
        KeyError: if a generated file name has no entry in ``labels``.
    """
    texts, targets = [], []
    # Files are numbered from 1, so iterate 1..len(labels) inclusive.
    for number in range(1, len(labels) + 1):
        name = 'inmail.{}'.format(number)
        path = os.path.join(DATA_DIR, name)
        texts.append(df.extract_email_text(path))
        targets.append(labels[name])
    return texts, targets

In [30]:
# Load the whole corpus: X holds the extracted email contents, y the 0/1 labels,
# with X[i] and y[i] describing the same email.
X, y = read_email_files()

In [37]:
# Randomly partition the emails into a training part and a testing part.
from sklearn.model_selection import train_test_split

# range(len(y)) is split together with X and y so that idx_train / idx_test
# record the original position of every sample in each subset.
# A fixed random_state makes the shuffle repeatable between runs.
(X_train, X_test,
 y_train, y_test,
 idx_train, idx_test) = train_test_split(
    X, y, range(len(y)),
    train_size=TRAINING_SET_RATIO,
    random_state=2,
)

In [40]:
# Convert each email into a numeric vector that MultinomialNB accepts as input.
# TF-IDF is used here: a bag-of-words representation in which term counts are
# re-weighted by inverse document frequency.
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training emails only; the test emails are
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)

In [42]:
# Train and test a Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Create the classifier with default parameters, fit it on the training
# vectors, then predict labels for the test vectors.
Mnb = MultinomialNB()
Mnb.fit(X_train_vector, y_train)
y_pred = Mnb.predict(X_test_vector)

In [43]:
# Print the per-class precision / recall / F1 report.
# target_names is listed in numeric label order: 0 -> 'Spam', 1 -> 'Ham'
# (labels are encoded as 1 for ham and 0 otherwise when the index is read).
print(classification_report(y_test, y_pred, target_names= ['Spam', 'Ham']))

# Print the overall accuracy as a percentage with one decimal place
print('Classification accuracy {:.1%}'.format(accuracy_score(y_test, y_pred)))

              precision    recall  f1-score   support

        Spam       0.99      0.94      0.97     15035
         Ham       0.90      0.98      0.94      7591

   micro avg       0.96      0.96      0.96     22626
   macro avg       0.94      0.96      0.95     22626
weighted avg       0.96      0.96      0.96     22626

Classification accuracy 95.5%
