# Importing packages

In [1]:
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import os
import re
import numpy as np
import scipy.spatial.distance as sp

ps = PorterStemmer() 
sno = SnowballStemmer("english")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Get list of file names

In [2]:
filenames = []
for fn in os.listdir("enron1\\ham"):
    if fn.endswith(".txt"):
        filenames.append(os.path.join("enron1\\ham", fn.replace("\\", "/")))
        
ham_num = len(filenames)

for fn in os.listdir("enron1\\spam"):
    if fn.endswith(".txt"):
        filenames.append(os.path.join("enron1\\spam", fn.replace("\\", "/")))
    
spam_num = len(filenames) - ham_num

In [3]:
len(filenames)

5172

# Stem email text

In [3]:
#flag to set initial df
first = True
stems_list = []
#iterate through each file
for fn in filenames:  
    #read file
    with open(fn, 'r', encoding="ISO-8859-1") as file:
        data = file.read().replace('\n', ' ')
        
        #tokenize + word-stem
        nltk_tokens = word_tokenize(data)
        stems = ""
        for w in nltk_tokens:
            stems += ps.stem(w) + " "
        
        stems_list.append(stems)


# Vectorize counts and get dictionary

In [4]:
#count vectorize
vect = CountVectorizer(input="content")
temp = vect.fit_transform(stems_list)
word_bag = vect.get_feature_names()


In [5]:
df = pd.DataFrame(temp.toarray(), columns=word_bag)
df

Unnamed: 0,00,000,0000,000000,000000000002858,000000000049773,000080,000099,0001,00018,...,zynv,zyqtaqlt,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,18,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Divide data into ham, spam, test, and train

In [6]:
ham_data = df[:ham_num]
spam_data = df[ham_num:]
ham_data["SPAM_LABEL"] = 0
spam_data["SPAM_LABEL"] = 1

msk = np.random.rand(len(ham_data)) < 0.7
ham_train = ham_data[msk]
ham_test = ham_data[~msk]

msk = np.random.rand(len(spam_data)) < 0.7
spam_train = spam_data[msk]
spam_test = spam_data[~msk]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [7]:
train_data = pd.concat([ham_train, spam_train])
test_data = pd.concat([ham_test, spam_test])

In [9]:
"L2_NORM" in word_bag

False

# Naive Bayes

In [27]:
k = 5
attr = train_data.columns.drop("SPAM_LABEL")

#copying data
train_data_knn = train_data[:10].copy()
test_data_knn = test_data[:10].copy()

#get neighbors
print("Getting norms...")
norms = sp.cdist(test_data_knn, train_data_knn)
print("Getting neighbors...")
neighbors = np.apply_along_axis(np.argpartition, 1, norms, k)[:,:k]


Getting norms...
Getting neighbors...


In [28]:
neighbors

array([[1, 4, 7, 6, 9],
       [6, 4, 8, 7, 1],
       [1, 8, 4, 6, 7],
       [8, 3, 2, 5, 6],
       [1, 8, 4, 6, 7],
       [6, 4, 8, 7, 1],
       [6, 4, 8, 7, 1],
       [1, 4, 7, 6, 9],
       [1, 4, 7, 6, 9],
       [1, 6, 7, 4, 8]], dtype=int64)

In [29]:
norms

array([[48.14561247,  4.58257569, 29.        , 24.69817807, 13.07669683,
        39.78693253, 16.76305461, 13.7113092 , 19.39071943, 17.17556404],
       [46.96807426, 14.73091986, 30.4466747 , 25.72936066, 18.24828759,
        41.43669871, 20.0748599 , 18.92088793, 20.29778313, 22.56102835],
       [54.10175598, 36.08323711, 43.01162634, 38.45776905, 36.22154055,
        51.90375709, 36.57868232, 36.9459064 , 34.71310992, 40.24922359],
       [57.32364259, 58.14636704, 49.82971001, 48.92851929, 56.92978131,
        51.48786265, 55.47071299, 56.76266379, 49.93996396, 60.40695324],
       [53.34791467, 37.96050579, 43.57751714, 39.34463115, 37.90778284,
        52.27810249, 38.22302971, 39.16631206, 36.95943723, 41.41255848],
       [47.78074926, 14.07124728, 30.85449724, 26.21068484, 17.54992877,
        41.1339276 , 19.84943324, 18.41195264, 20.02498439, 21.72556098],
       [48.15599651, 10.95445115, 28.98275349, 24.8394847 , 14.96662955,
        40.66939882, 17.88854382, 15.84297952

In [11]:
train_data_knn.iloc[list(neighbors[1]), :]["SPAM_LABEL"].mode()[0]

0