# Importing packages

In [231]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import collections as c
import os
import re
import numpy as np
import scipy.spatial.distance as sp
import math

ps = PorterStemmer() 
sno = SnowballStemmer("english")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\janep\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\janep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Get list of file names

In [2]:
filenames = []
for fn in os.listdir("enron1\\ham"):
    if fn.endswith(".txt"):
        filenames.append(os.path.join("enron1\\ham", fn.replace("\\", "/")))
        
ham_num = len(filenames)

for fn in os.listdir("enron1\\spam"):
    if fn.endswith(".txt"):
        filenames.append(os.path.join("enron1\\spam", fn.replace("\\", "/")))
    
spam_num = len(filenames) - ham_num

In [3]:
len(filenames)

5172

# Stem email text

In [121]:
#flag to set initial df
first = True
stems_list = []
#iterate through each file
for fn in filenames:  
    #read file
    with open(fn, 'r', encoding="ISO-8859-1") as file:
        
        #remove numbers
        data = re.sub('[0-9]', '', file.read().replace('\n', ' '))
        
        #data = file.read().replace('\n', ' ')
        
        #tokenize + word-stem
        nltk_tokens = word_tokenize(data)
        stems = ""
        for w in nltk_tokens:
            stems += ps.stem(w) + " "
        
        stems_list.append(stems)


# Vectorize counts and get dictionary

In [122]:
#count vectorize
vect = CountVectorizer(input="content")
temp = vect.fit_transform(stems_list)
word_bag = vect.get_feature_names()



In [123]:
df = pd.DataFrame(temp.toarray(), columns=word_bag)
df

Unnamed: 0,aa,aaa,aabda,aabvmmq,aac,aachecar,aaer,aafco,aaiab,aaigrcrb,...,zynv,zyqtaqlt,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [140]:
#dropping stop words
nonstop_attr = df.columns.copy()
for word in stopwords.words('english'):
    if word in df.columns:
        nonstop_attr = nonstop_attr.drop(word)

In [141]:
len(df.columns)-len(nonstop_attr)

119

# Divide data into ham, spam, test, and train

In [150]:
df_nonstop = df[nonstop_attr].copy()[nonstop_attr]

ham_data = df_nonstop[:ham_num]
spam_data = df_nonstop[ham_num:]
ham_data["SPAM_LABEL"] = 0
spam_data["SPAM_LABEL"] = 1

msk = np.random.rand(len(ham_data)) < 0.7
ham_train = ham_data[msk]
ham_test = ham_data[~msk]

msk = np.random.rand(len(spam_data)) < 0.7
spam_train = spam_data[msk]
spam_test = spam_data[~msk]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [151]:
train_data = pd.concat([ham_train, spam_train])
test_data = pd.concat([ham_test, spam_test])

In [152]:
train_data

Unnamed: 0,aa,aaa,aabda,aabvmmq,aac,aachecar,aaer,aafco,aaiab,aaigrcrb,...,zyqtaqlt,zyrtec,zyyqywp,zzezrjok,zzn,zzo,zzocb,zzso,zzsyt,SPAM_LABEL
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# K Nearest Neighbors

This is old code which is pretty slow. We'll keep it for memory's sake.


In [39]:
#get data
# attr = train_data.columns.drop("SPAM_LABEL")
# train_data_knn = train_data.copy()
# test_data_knn = test_data.copy()
# test_data_knn["y_1"] = -1
# test_data_knn["y_10"] = -1
# test_data_knn["y_50"] = -1
# test_data_knn["y_200"] = -1
# test_data_knn["y_1000"] = -1

# for i in range(len(test_data_knn.index)):
#     temp = train_data_knn[attr].subtract(test_data_knn.iloc[i][attr])
#     train_data_knn["L2_NORM"] = temp.apply(np.linalg.norm, axis=1)
#     train_data_knn = train_data_knn.sort_values(by="L2_NORM", ascending=True)
#     test_data_knn["y_1"].iloc[i] = train_data_knn[:1]["SPAM_LABEL"].value_counts().idxmax()
#     test_data_knn["y_10"].iloc[i] = train_data_knn[:10]["two_year_recid"].value_counts().idxmax()
#     test_data_knn["y_50"].iloc[i] = train_data_knn[:50]["two_year_recid"].value_counts().idxmax()
#     test_data_knn["y_200"].iloc[i] = train_data_knn[:200]["two_year_recid"].value_counts().idxmax()
#     test_data_knn["y_1000"].iloc[i] = train_data_knn[:1000]["two_year_recid"].value_counts().idxmax()
    
    

Here's code that runs in 5-10 min. For some reason the accuracy is lower than above even though computation should be identical... could be variation from random seed splitting training and test.

In [153]:
#setting parameters
k = 5
attr = train_data.columns.drop("SPAM_LABEL")

#copying data
train_data_knn = train_data.copy()
test_data_knn = test_data.copy()

#get neighbors
print("Getting norms...")
norms = sp.cdist(test_data_knn, train_data_knn)
print("Getting neighbors...")
neighbors = np.apply_along_axis(np.argpartition, 0, norms, k)[:,:k]


#classifying
print("Classifying...")
test_data_knn["SPAM_LABEL_y"] = -1
for i in range(len(test_data_knn.index)):
    test_data_knn["SPAM_LABEL_y"] = train_data_knn.iloc[list(neighbors[i]), :]["SPAM_LABEL"].mode()[0]

Getting norms...
Getting neighbors...
Classifying...


In [154]:
len(test_data_knn[test_data_knn["SPAM_LABEL_y"] == test_data_knn["SPAM_LABEL"]].index)/len(test_data_knn.index)

0.7032173342087984

In [88]:
#get accuracy
len(test_data_knn[test_data_knn["SPAM_LABEL_y"] == test_data_knn["SPAM_LABEL"]].index)/len(test_data_knn.index)

0.6899736147757256

# Naive Bayes

In [232]:
def bayes_prob(row, freq, attr, total):
    p = 0
    for i in range(len(attr)):
        #using add-one laplacian smoothing
        p += math.log((freq[i][row[i+1]]+1)/total
        
    return p


In [195]:
#get training data
train_data_b0 = train_data[train_data["SPAM_LABEL"]==0]
train_data_b1 = train_data[train_data["SPAM_LABEL"]==1]

#build frequency table
freq_0 = [c.Counter(train_data_b0[col]) for col in attr]
freq_1 = [c.Counter(train_data_b1[col]) for col in attr]

#get counts
counts = train_data["SPAM_LABEL"].value_counts()
total = counts.sum()

#get test data
test_data_bayes = test_data.copy()
test_data_bayes["SPAM_LABEL_y"] = -1


In [240]:
#classiying
for row in test_data.itertuples():
    p_0 = bayes_prob(row, freq_0, attr, counts[0]) + math.log(counts[0]/total)
    p_1 = bayes_prob(row, freq_1, attr, counts[1]) + math.log(counts[1]/total)
    
#     print(row[0])
#     print(p_0)
#     print(p_1)
    
    #give label
    if p_0 > p_1:
        test_data_bayes["SPAM_LABEL_y"].loc[row[0]] = 0
    else:
        test_data_bayes["SPAM_LABEL_y"].loc[row[0]] = 1

1
-422.5594713576798
-474.476687829454
12
-216.30937861342528
-297.6334948029499
15
-202.977719493091
-295.5300862901168
21
-310.151132749945
-369.46411406552716
27
-265.7086300772932
-321.16685639365227
28
-244.35136494029834
-308.7144045550304
29
-172.06653646174968
-213.94366912881355
31
-131.85346742837675
-182.61811473585337
34
-125.86897379616569
-153.29561428651525
35
-279.1049757917891
-341.48801408909316
38
-454.3632043854756
-516.1598345075766
39
-351.4719180940531
-406.08795750532977
42
-276.42110695848936
-291.5476473799129
43
-1070.3865264592503
-1116.8926715131556
49
-105.99787989523232
-168.32464492528345
61
-102.76358509929894
-117.41150888377805
63
-231.98671415018237
-298.08469032150896
76
-304.34218831219465
-317.431165716324
77
-243.53476323794723
-333.67411205990703
78
-79.09389291961232
-95.46899242425262
90
-73.6918650880999
-108.34433606086519
91
-56.780369332623174
-77.81059600986622
93
-313.05656648474917
-322.8921140776503
100
-92.08321153193545
-152.50328746

-416.62467877920807
-523.8533074934095
662
-164.54675958155255
-209.57639667773736
663
-340.5323800931971
-427.5235985230402
667
-113.16321105007223
-152.00799802540033
673
-356.1394269334312
-442.3555678303741
678
-67.86187822772676
-84.4894162422945
679
-135.691092579775
-161.31450192265507
685
-134.97261621305432
-153.62830507979626
693
-138.68865069641865
-170.48284979740396
698
-217.37911897654539
-214.10939088915381
701
-157.83339570738855
-244.63008847810997
703
-97.50393247141834
-142.89743274541263
704
-121.58441808777403
-156.25566876978417
706
-182.12386519642465
-194.94504706342255
716
-175.6529652508252
-211.16438966621078
728
-452.68344432969883
-566.5217865273595
731
-191.5285976511671
-246.54706554887665
732
-368.82965050302334
-463.98276679502055
734
-75.04162778254847
-98.43509573660558
736
-269.45362900896026
-294.6611492345468
738
-87.13872722135083
-108.73530495541777
739
-389.6170267314695
-440.2499076571706
740
-734.648265190157
-680.4359268681892
743
-179.202624

-69.87136529284646
-108.97793478434834
1372
-114.14411482125101
-150.40736409738835
1375
-331.738473362925
-465.1599990929476
1377
-466.3085296225979
-578.0033224470249
1380
-269.38170413510574
-261.50285710230236
1381
-476.30968492550534
-488.98774034680326
1383
-54.4411155038068
-92.1069341618135
1386
-471.121301412291
-466.18303730913607
1393
-54.4411155038068
-92.1069341618135
1395
-77.42429704200515
-113.38550422495308
1399
-73.73842202130918
-107.17158408501452
1400
-71.26895920921928
-110.4820121811234
1401
-116.76656695565784
-137.12697305894125
1409
-175.93505857340114
-208.24629906702327
1419
-65.01409210853018
-104.73995720556519
1426
-86.44003115631465
-114.98182185145292
1427
-678.809019659253
-763.148359819242
1430
-508.4674022132333
-598.2688914204991
1435
-1697.7770717959822
-1521.2228816676325
1441
-56.609742814172506
-75.93056326685283
1446
-184.81301839424563
-262.3966830683115
1447
-125.85282313368185
-131.13751239348665
1448
-54.4411155038068
-92.1069341618135
1450

-264.07909242745245
-322.1048410551048
2043
-1196.1355268375276
-1136.0558520775778
2044
-427.0774526299189
-422.73941418758795
2045
-57.08484216130454
-99.06252677021409
2048
-366.8149343118245
-453.47623906860076
2050
-269.6616357290447
-360.90176412253453
2051
-213.6104264192072
-290.65145432639173
2053
-2231.2932909480187
-2002.9214643503751
2056
-672.6558690113326
-957.1032456948346
2057
-109.55457456861855
-117.51669047348545
2068
-369.8311015778161
-351.463452642112
2071
-55.258262926915386
-92.10693416181651
2080
-93.98154403257729
-132.28948742117007
2088
-101.6012325150743
-126.20922177842486
2089
-417.9056473436704
-525.0613138234695
2090
-136.88819442863772
-164.8920834939297
2092
-449.76695750690357
-525.0458081885055
2093
-274.80039524850173
-355.31853061281413
2094
-707.3121794590195
-597.4595363825837
2102
-150.86841515648504
-190.12024473778524
2104
-119.26242286713271
-169.16658293185625
2105
-123.14005418035323
-148.13337875966
2111
-74.20356857667905
-102.9922914675

-136.57318649548571
-227.11071305104088
2670
-452.22893923266537
-423.57443504824187
2671
-317.5835261915697
-326.89368859295104
2673
-84.67721640815368
-111.387078952127
2682
-53.93982657733632
-91.52093676061124
2683
-70.18137110402698
-105.48285795140782
2684
-269.3322675384963
-317.29142301396604
2691
-300.0867816645673
-362.6251124783994
2692
-202.04244504753666
-253.94105631043251
2694
-639.906988927394
-682.1067821815906
2695
-639.906988927394
-682.1067821815906
2699
-545.86789998169
-513.3435566368072
2705
-89.42494181323727
-127.64509652202628
2709
-224.43806280165427
-284.1470010123594
2712
-1873.2863085301865
-1728.3874339493486
2719
-53.93982657733632
-91.52093676061124
2721
-136.57318649548571
-227.11071305104088
2723
-197.5316904114628
-271.8882359553341
2724
-197.5316904114628
-271.8882359553341
2729
-175.10860986580843
-210.1409878440708
2733
-72.99354125324145
-87.8141508047774
2734
-53.93982657733632
-91.52093676061124
2737
-134.7236823645415
-222.10868613148122
2741


-101.67778417397832
-124.52036897244942
3322
-218.21678811323633
-208.42516096103805
3329
-901.597028458889
-918.0608241066717
3330
-244.11052572938618
-269.40988375441117
3343
-83.64901558596581
-117.59871012428695
3347
-140.24075278448686
-164.4705318480537
3348
-306.2978400137161
-373.1690972988603
3350
-310.9554772620898
-365.8401143227672
3352
-129.29517629913138
-159.23495821544196
3353
-189.10672491989666
-254.47305811844217
3356
-283.06461010140345
-319.8681401118078
3359
-151.52223311284314
-173.03551365155639
3360
-226.69977187271286
-282.9935359634683
3361
-288.9784726670066
-340.06198210157544
3365
-213.94695268118147
-229.0708161973429
3366
-65.70932824737092
-84.10719046415741
3367
-270.05987061772043
-330.0242892892092
3368
-355.0111627289502
-431.4697329935692
3369
-135.99660426051958
-163.82606004672988
3371
-103.99875280592795
-116.74950007692203
3374
-79.78678704922869
-98.88560910765798
3376
-224.2067972024817
-278.0093263273081
3380
-89.34375117514827
-107.40350410

-780.8964690267833
-525.3511112148299
3951
-201.44271405505054
-174.20289397403252
3956
-42.104231624873115
-56.182097624362854
3960
-821.9625510078092
-645.1886552150928
3970
-156.1346132589008
-139.69827748394974
3972
-1723.7190976482707
-1399.2710615044332
3976
-165.95206524028768
-135.85664366511534
3977
-428.58746237823635
-346.53236825825803
3981
-1727.1225713258339
-1543.2525473859437
3982
-236.52446750683418
-195.1233618687429
3983
-463.4860218494483
-351.2077162101654
3990
-156.12322445203725
-141.95260257553838
3993
-148.11615226485128
-136.7097772975425
3995
-260.79550429012147
-206.68589649864117
4006
-427.7213120877552
-346.2655729584609
4008
-2415.802936680732
-2155.7297847989084
4011
-1061.0631775000563
-891.6101216726629
4012
-761.0106806928796
-646.3133923246587
4019
-263.2984543904652
-212.23563312348492
4021
-2628.9483096887957
-2180.926870923567
4025
-1417.595208639984
-1168.9053533868064
4029
-471.2424877817283
-422.40700076481875
4032
-220.75384150311365
-201.0174

-858.7968915123731
-731.343210087382
4589
-140.24831110972906
-116.59211886031471
4591
-288.2332639132913
-250.06848686706195
4592
-458.18024836103154
-390.0578863549754
4593
-37.65766646906167
-49.92156059405226
4600
-458.29797130395076
-369.8283007275138
4604
-651.4958043126906
-555.0968246598504
4607
-1374.035429699871
-1018.2155427585266
4610
-547.3373360790183
-475.3819212373548
4611
-97.45799274929432
-87.70060084710903
4615
-268.28542042779844
-216.23481777483
4617
-140.67753630805856
-129.79463832828904
4618
-864.4725221614149
-567.6915516706808
4622
-228.3741110738545
-190.38089001689488
4629
-123.70379211643177
-128.54231584900353
4631
-441.4748603357098
-374.25885701356236
4634
-770.5978862402667
-485.25123600985955
4636
-768.7849287608367
-477.8028052886085
4645
-158.17038421136297
-145.08728673482864
4650
-345.69714857572416
-317.99589815613615
4652
-1645.5989506075641
-1248.1255073578425
4654
-4654.5449248498335
-4128.597808151224
4658
-2232.0088319639576
-1701.6670197794

In [241]:
#accuracy
len(test_data_bayes[test_data_bayes["SPAM_LABEL_y"] == test_data_bayes["SPAM_LABEL"]].index)/len(test_data_bayes.index)

0.9179251477347341