In [32]:
import io
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt

In [23]:
def readData(path):
    for root, folder, files in os.walk(path):
        for filename in files:
            newPath = root +"/"+filename
            file = open(newPath, encoding='latin1')
            lines = file.readlines()
            content = []
            inBody = False
            for line in lines:
                if line == '\n':
                    inBody = True
                if inBody:
                    content.append(line)
            file.close()
            msg = "\n".join(content)
            yield msg

In [24]:
def df(path,classification):
    dataset = []
    for msg in readData(path):
        row = {"message":msg,"class":classification}
        dataset.append(row)
    return pd.DataFrame(dataset)

In [25]:
dataset = pd.DataFrame({"message":[],"class":[]})
dataset = dataset.append(df('emails/ham/','ham'))
dataset = dataset.append(df('emails/spam/','spam'))

In [26]:
dataset.head()

Unnamed: 0,class,message
0,ham,"\n\n Date: Wed, 21 Aug 2002 10:54:46..."
1,ham,"\n\nMartin A posted:\n\nTassos Papadopoulos, t..."
2,ham,\n\nMan Threatens Explosion In Moscow \n\n\n\n...
3,ham,\n\nKlez: The Virus That Won't Die\n\n \n\nAlr...
4,ham,"\n\n> in adding cream to spaghetti carbonara,..."


In [27]:
dataset.shape

(3000, 2)

In [29]:
X = dataset.iloc[:,1].values
y = dataset.iloc[:,0].values

In [30]:
set(y)

{'ham', 'spam'}

In [33]:
lb = LabelEncoder()

In [34]:
y = lb.fit_transform(y)

In [35]:
set(y)

{0, 1}

In [36]:
X[0]

'\n\n    Date:        Wed, 21 Aug 2002 10:54:46 -0500\n\n    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\n\n    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>\n\n\n\n\n\n  | I can\'t reproduce this error.\n\n\n\nFor me it is very repeatable... (like every time, without fail).\n\n\n\nThis is the debug log of the pick happening ...\n\n\n\n18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}\n\n18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury\n\n18:19:04 Ftoc_PickMsgs {{1 hit}}\n\n18:19:04 Marking 1 hits\n\n18:19:04 tkerror: syntax error in expression "int ...\n\n\n\nNote, if I run the pick command by hand ...\n\n\n\ndelta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury\n\n1 hit\n\n\n\nThat\'s where the "1 hit" comes from (obviously).  The version of nmh I\'m\n\nusing is ...\n\n\n\ndelta$ p

In [37]:
tokens = []
for i in range(len(X)):
    tokens.append(word_tokenize(X[i].lower()))

In [39]:
# print(tokens[0])

In [33]:
eng_stopwords = stopwords.words('english')

In [43]:
eng_stopwords.extend([',','.',"'",':','_','}','{',')',']','>','<','+','...','|','$','@','!','-','[','(','/','\''])

In [44]:
words = []
for list_1 in tokens:
    main_words = []
    for word in list_1:
        if word not in eng_stopwords:
            main_words.append(word)
    words.append(main_words)

In [45]:
print(words[0])

['date', 'wed', '21', 'aug', '2002', '10:54:46', '-0500', 'chris', 'garrigues', 'cwg-dated-1030377287.06fa6d', 'deepeddy.com', 'message-id', '1029945287.4797.tmda', 'deepeddy.vircio.com', 'ca', "n't", 'reproduce', 'error', 'repeatable', 'like', 'every', 'time', 'without', 'fail', 'debug', 'log', 'pick', 'happening', '18:19:03', 'pick_it', 'exec', 'pick', '+inbox', '-list', '-lbrace', '-lbrace', '-subject', 'ftp', '-rbrace', '-rbrace', '4852-4852', '-sequence', 'mercury', '18:19:03', 'exec', 'pick', '+inbox', '-list', '-lbrace', '-lbrace', '-subject', 'ftp', '-rbrace', '-rbrace', '4852-4852', '-sequence', 'mercury', '18:19:04', 'ftoc_pickmsgs', '1', 'hit', '18:19:04', 'marking', '1', 'hits', '18:19:04', 'tkerror', 'syntax', 'error', 'expression', '``', 'int', 'note', 'run', 'pick', 'command', 'hand', 'delta', 'pick', '+inbox', '-list', '-lbrace', '-lbrace', '-subject', 'ftp', '-rbrace', '-rbrace', '4852-4852', '-sequence', 'mercury', '1', 'hit', "'s", '``', '1', 'hit', "''", 'comes', 'o

In [46]:
wnet = WordNetLemmatizer()

In [47]:
for i in range(len(words)):
    for j in range(len(words[i])):
        lemm = wnet.lemmatize(words[i][j], pos='v')
        words[i][j] = lemm

In [49]:
print(words[0])

['date', 'wed', '21', 'aug', '2002', '10:54:46', '-0500', 'chris', 'garrigues', 'cwg-dated-1030377287.06fa6d', 'deepeddy.com', 'message-id', '1029945287.4797.tmda', 'deepeddy.vircio.com', 'ca', "n't", 'reproduce', 'error', 'repeatable', 'like', 'every', 'time', 'without', 'fail', 'debug', 'log', 'pick', 'happen', '18:19:03', 'pick_it', 'exec', 'pick', '+inbox', '-list', '-lbrace', '-lbrace', '-subject', 'ftp', '-rbrace', '-rbrace', '4852-4852', '-sequence', 'mercury', '18:19:03', 'exec', 'pick', '+inbox', '-list', '-lbrace', '-lbrace', '-subject', 'ftp', '-rbrace', '-rbrace', '4852-4852', '-sequence', 'mercury', '18:19:04', 'ftoc_pickmsgs', '1', 'hit', '18:19:04', 'mark', '1', 'hit', '18:19:04', 'tkerror', 'syntax', 'error', 'expression', '``', 'int', 'note', 'run', 'pick', 'command', 'hand', 'delta', 'pick', '+inbox', '-list', '-lbrace', '-lbrace', '-subject', 'ftp', '-rbrace', '-rbrace', '4852-4852', '-sequence', 'mercury', '1', 'hit', "'s", '``', '1', 'hit', "''", 'come', 'obviously

In [50]:
for i in range(len(words)):
    words[i] = ' '.join(words[i])

In [53]:
tfidf = TfidfVectorizer()

In [54]:
vect = tfidf.fit_transform(words)

In [89]:
newX = vect.toarray()

In [56]:
x_train,x_test,y_train,y_test = train_test_split(vect,y,test_size=0.25)

In [57]:
x_train.shape, y_train.shape

((2250, 59273), (2250,))

# Logistic Regression

In [58]:
reg = LogisticRegression()

In [59]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [60]:
y_pred = reg.predict(x_test)

In [61]:
accuracy_score(y_test, y_pred)

0.948

In [64]:
confusion_matrix(y_test, y_pred)

array([[633,   0],
       [ 39,  78]], dtype=int64)

In [78]:
def pred(rev):
    words = []
    tokens = []
    text = [rev]
    for i in range(len(text)):
        tokens.append(word_tokenize(text[i].lower()))
    for j in tokens:
        main_words = []
        for k in j:
            if k not in eng_stopwords:
                main_words.append(k)
        words.append(main_words)
    for l in range(len(words)):
        for m in range(len(words[l])):
            lemm = wnet.lemmatize(words[l][m], pos='v')
            words[l][m] = lemm
    for n in range(len(words)):
        words[n] = ' '.join(words[n])
    vect = tfidf.transform(words)
    w_arr = vect.toarray()
    pred = reg.predict(w_arr)
    if pred == [0]:
        print("ham")
    else:
        print("spam")

In [84]:
rev = X[345]

In [85]:
pred(rev)

ham
