# Importing packages

In [125]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import collections as c
import os
import re
import numpy as np
import scipy.spatial.distance as sp

ps = PorterStemmer() 
sno = SnowballStemmer("english")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\janep\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


# Get list of file names

In [2]:
filenames = []
for fn in os.listdir("enron1\\ham"):
    if fn.endswith(".txt"):
        filenames.append(os.path.join("enron1\\ham", fn.replace("\\", "/")))
        
ham_num = len(filenames)

for fn in os.listdir("enron1\\spam"):
    if fn.endswith(".txt"):
        filenames.append(os.path.join("enron1\\spam", fn.replace("\\", "/")))
    
spam_num = len(filenames) - ham_num

In [3]:
len(filenames)

5172

# Stem email text

In [121]:
#flag to set initial df
first = True
stems_list = []
#iterate through each file
for fn in filenames:  
    #read file
    with open(fn, 'r', encoding="ISO-8859-1") as file:
        
        #remove numbers
        data = re.sub('[0-9]', '', file.read().replace('\n', ' '))
        
        #data = file.read().replace('\n', ' ')
        
        #tokenize + word-stem
        nltk_tokens = word_tokenize(data)
        stems = ""
        for w in nltk_tokens:
            stems += ps.stem(w) + " "
        
        stems_list.append(stems)


# Vectorize counts and get dictionary

In [122]:
#count vectorize
vect = CountVectorizer(input="content")
temp = vect.fit_transform(stems_list)
word_bag = vect.get_feature_names()



In [123]:
df = pd.DataFrame(temp.toarray(), columns=word_bag)
df

Unnamed: 0,aa,aaa,aabda,aabvmmq,aac,aachecar,aaer,aafco,aaiab,aaigrcrb,...,zynv,zyqtaqlt,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [140]:
#dropping stop words
nonstop_attr = df.columns.copy()
for word in stopwords.words('english'):
    if word in df.columns:
        nonstop_attr = nonstop_attr.drop(word)

In [141]:
len(df.columns)-len(nonstop_attr)

119

# Divide data into ham, spam, test, and train

In [150]:
df_nonstop = df[nonstop_attr].copy()[nonstop_attr]

ham_data = df_nonstop[:ham_num]
spam_data = df_nonstop[ham_num:]
ham_data["SPAM_LABEL"] = 0
spam_data["SPAM_LABEL"] = 1

msk = np.random.rand(len(ham_data)) < 0.7
ham_train = ham_data[msk]
ham_test = ham_data[~msk]

msk = np.random.rand(len(spam_data)) < 0.7
spam_train = spam_data[msk]
spam_test = spam_data[~msk]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [151]:
train_data = pd.concat([ham_train, spam_train])
test_data = pd.concat([ham_test, spam_test])

In [152]:
train_data

Unnamed: 0,aa,aaa,aabda,aabvmmq,aac,aachecar,aaer,aafco,aaiab,aaigrcrb,...,zyqtaqlt,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt,SPAM_LABEL
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# K Nearest Neighbors

This is old code which is pretty slow. We'll keep it for memory's sake.


In [39]:
#get data
# attr = train_data.columns.drop("SPAM_LABEL")
# train_data_knn = train_data.copy()
# test_data_knn = test_data.copy()
# test_data_knn["y_1"] = -1
# test_data_knn["y_10"] = -1
# test_data_knn["y_50"] = -1
# test_data_knn["y_200"] = -1
# test_data_knn["y_1000"] = -1

# for i in range(len(test_data_knn.index)):
#     temp = train_data_knn[attr].subtract(test_data_knn.iloc[i][attr])
#     train_data_knn["L2_NORM"] = temp.apply(np.linalg.norm, axis=1)
#     train_data_knn = train_data_knn.sort_values(by="L2_NORM", ascending=True)
#     test_data_knn["y_1"].iloc[i] = train_data_knn[:1]["SPAM_LABEL"].value_counts().idxmax()
#     test_data_knn["y_10"].iloc[i] = train_data_knn[:10]["two_year_recid"].value_counts().idxmax()
#     test_data_knn["y_50"].iloc[i] = train_data_knn[:50]["two_year_recid"].value_counts().idxmax()
#     test_data_knn["y_200"].iloc[i] = train_data_knn[:200]["two_year_recid"].value_counts().idxmax()
#     test_data_knn["y_1000"].iloc[i] = train_data_knn[:1000]["two_year_recid"].value_counts().idxmax()
    
    

Here's code that runs in 5-10 min. For some reason the accuracy is lower than above even though computation should be identical... could be variation from random seed splitting training and test.

In [153]:
#setting parameters
k = 5
attr = train_data.columns.drop("SPAM_LABEL")

#copying data
train_data_knn = train_data.copy()
test_data_knn = test_data.copy()

#get neighbors
print("Getting norms...")
norms = sp.cdist(test_data_knn, train_data_knn)
print("Getting neighbors...")
neighbors = np.apply_along_axis(np.argpartition, 0, norms, k)[:,:k]


#classifying
print("Classifying...")
test_data_knn["SPAM_LABEL_y"] = -1
for i in range(len(test_data_knn.index)):
    test_data_knn["SPAM_LABEL_y"] = train_data_knn.iloc[list(neighbors[i]), :]["SPAM_LABEL"].mode()[0]

Getting norms...
Getting neighbors...
Classifying...


In [154]:
len(test_data_knn[test_data_knn["SPAM_LABEL_y"] == test_data_knn["SPAM_LABEL"]].index)/len(test_data_knn.index)

0.7032173342087984

In [88]:
#get accuracy
len(test_data_knn[test_data_knn["SPAM_LABEL_y"] == test_data_knn["SPAM_LABEL"]].index)/len(test_data_knn.index)

0.6899736147757256

# Naive Bayes

In [None]:
def bayes_prob(row, train, attr):
    p = 1
    for i in range(len(attr)):
        if row[i+2] in train.index:
            p = p * train.loc[row[i+2]][attr[i]]
        else:
            return 0
    return p


#split data 
train_data_b0 = train_data[train_data["SPAM_LABEL"]==0].apply(pd.Series.value_counts).fillna(0)
train_data_b1 = train_data[train_data["SPAM_LABEL"]==1].apply(pd.Series.value_counts).fillna(0)

#get counts and adjust for proportion
counts = train_data["SPAM_LABEL"].value_counts()
train_data_b0 = train_data_b0/counts[0]
train_data_b1 = train_data_b1/counts[1]
counts = counts/counts.sum()

#get test data
test_data_bayes = test_data.copy()
test_data_bayes["SPAM_LABEL_y"] = -1

#classification
i = 0
for row in test_data.itertuples():
    p_0 = bayes_prob(row, train_data_b0, attr) * counts[0]
    p_1 = bayes_prob(row, train_data_b1, attr) * counts[1]

    #give label
    if p_0 > p_1:
        test_data_bayes["SPAM_LABEL_y"].loc[row[0]] = 0
    else:
        test_data_bayes["SPAM_LABEL_y"].loc[row[0]] = 1
        
    print(i)
        



In [None]:
def bayes_prob(row, train, attr):
    p = 1
    for i in range(len(attr)):
        if row[i+2] in train.index:
            p = p * train.loc[row[i+2]][attr[i]]
        else:
            return 0
    return p


#split data 
train_data_b0 = train_data[train_data["SPAM_LABEL"]==0].apply(pd.Series.value_counts).fillna(0)
train_data_b1 = train_data[train_data["SPAM_LABEL"]==1].apply(pd.Series.value_counts).fillna(0)

#get counts and adjust for proportion
counts = train_data["SPAM_LABEL"].value_counts()
train_data_b0 = train_data_b0/counts[0]
train_data_b1 = train_data_b1/counts[1]
counts = counts/counts.sum()

#get test data
test_data_bayes = test_data.copy()
test_data_bayes["SPAM_LABEL_y"] = -1

#classification
i = 0
for row in test_data.itertuples():
    p_0 = bayes_prob(row, train_data_b0, attr) * counts[0]
    p_1 = bayes_prob(row, train_data_b1, attr) * counts[1]

    #give label
    if p_0 > p_1:
        test_data_bayes["SPAM_LABEL_y"].loc[row[0]] = 0
    else:
        test_data_bayes["SPAM_LABEL_y"].loc[row[0]] = 1
        
    print(i)
        



In [98]:
train_data_b = train_data.copy()
freq = [c.Counter(train_data_b[col]) for col in attr]

In [156]:
#get counts and adjust for proportion
test_data_bayes = test_data.copy()
test_data_bayes["SPAM_LABEL_y"] = -1

train_data_b0 = train_data[train_data["SPAM_LABEL"]==0]
train_data_b1 = train_data[train_data["SPAM_LABEL"]==1]
counts = train_data["SPAM_LABEL"].value_counts()
total = counts.sum()
# train_data_b0 = train_data_b0/counts[0]
# train_data_b1 = train_data_b1/counts[1]
# counts = counts/counts.sum()

In [162]:
test_data_bayes

Unnamed: 0,aa,aaa,aabda,aabvmmq,aac,aachecar,aaer,aafco,aaiab,aaigrcrb,...,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt,SPAM_LABEL,SPAM_LABEL_y
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
27,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [174]:
def bayes_prob(row, train, attr, total):
    p = 1
    for i in range(len(attr)):
        if row[i+1] in freq[i].keys():
            p = p * freq[i][0]/total
        else:
            return 0
    return p

#classification
for row in test_data.itertuples():
    p_0 = bayes_prob(row, train_data_b0, attr, counts[0]) * counts[0]/total
    p_1 = bayes_prob(row, train_data_b1, attr, counts[1]) * counts[1]/total
    print(p_0)
    print(p_1)
    #give label
    if p_0 > p_1:
        test_data_bayes["SPAM_LABEL_y"].loc[row[0]] = 0
    else:
        test_data_bayes["SPAM_LABEL_y"].loc[row[0]] = 1

  """


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


KeyboardInterrupt: 

In [177]:
0 in freq[i].keys()

True

In [171]:
for row in test_data.iloc[0].itertuples():
    print(row)

AttributeError: 'Series' object has no attribute 'itertuples'