In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from html.parser import HTMLParser
import re
import os
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import pickle
from sklearn.svm import SVR

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def handle_starttag(self, tag, attrs):
        if(tag=='a'):
            for (att, val) in attrs:
                if (att=='href'):
                    self.fed.append(' hreflink ')
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()


vocab = []
word_list = list(set(words.words()))
labels = []

def read_unlabeled_email(filename):
    try:
        with open(filename, 'r') as File:  
            return re.sub(r'[^a-zA-Z]', ' ', strip_tags(re.sub(r'http\S+', ' hreflink ', File.read()))).strip().lower()
    except UnicodeDecodeError:
        print('couldnt read {!s}',filename)
        return None
def read_email(filename,label):
    print(filename)
    try:
        with open(filename, 'r') as File:  
            
            email = re.sub(r'[^a-zA-Z]', ' ', strip_tags(re.sub(r'http\S+', ' hreflink ', File.read()))).strip().lower()
            for word in word_tokenize(email):
                if (word in word_list) and (len(word) >= 2):
                    vocab.append(word)
            labels.append(label)
            return email
    except UnicodeDecodeError:
        print('couldnt read {!s}',filename)
        return None


corpus = []
spam_train_path = './training/spam/'
for filename in os.listdir(spam_train_path):
    em = read_email((spam_train_path+filename),1)
    if (em is None):
        continue
    corpus.append(em)
    
    
ham_train_path = './training/ham/'
for filename in os.listdir(ham_train_path):
    em = read_email((ham_train_path+filename),0)
    if (em is None):
        continue
    corpus.append(em)


vocab = list(set(vocab))
print(len(vocab))

#saving vars
f = open('./vars/corpus.pckl', 'wb')
pickle.dump(corpus, f)
f.close()

f = open('./vars/vocab.pckl', 'wb')
pickle.dump(vocab, f)
f.close()

f = open('./vars/labels.pckl', 'wb')
pickle.dump(labels, f)
f.close()


    




./training/spam/00001.317e78fa8ee2f54cd4890fdc09ba8176
./training/spam/00002.9438920e9a55591b18e60d1ed37d992b
./training/spam/00003.590eff932f8704d8b0fcbe69d023b54d
./training/spam/00004.bdcc075fa4beb5157b5dd6cd41d8887b
./training/spam/00005.ed0aba4d386c5e62bc737cf3f0ed9589
./training/spam/00006.3ca1f399ccda5d897fecb8c57669a283
./training/spam/00007.acefeee792b5298f8fee175f9f65c453
./training/spam/00008.ccf927a6aec028f5472ca7b9db9eee20
./training/spam/00009.1e1a8cb4b57532ab38aa23287523659d
./training/spam/00010.2558d935f6439cb40d3acb8b8569aa9b
./training/spam/00011.bd8c904d9f7b161a813d222230214d50
./training/spam/00012.cb9c9f2a25196f5b16512338625a85b4
./training/spam/00013.372ec9dc663418ca71f7d880a76f117a
./training/spam/00014.13574737e55e51fe6737a475b88b5052
./training/spam/00015.206d5a5d1d34272ae32fc286788fdf55
./training/spam/00016.4fb07c8dff1a5a2b4889dc5024c55023
./training/spam/00017.6430f3b8dedf51ba3c3fcb9304e722e7
./training/spam/00018.336cb9e7b0358594cf002e7bf669eaf5
./training

./training/spam/00150.957bb781217f762dc9999e7a90130c92
./training/spam/00151.6abbf42bc1bfb6c36b749372da0cffae
./training/spam/00152.a9f16de7f087215259a15322961bf9c0
./training/spam/00153.d20d157c684520f1c3aa8f270f753785
./training/spam/00154.fb13b55bdbb01e81ac9b8ee6f13948d5
./training/spam/00155.b043f6801a72ada945205709c44c8ac4
./training/spam/00156.ccb4ccdec1949b9510ab71d05122de3f
./training/spam/00157.eab255ce291e88fbee46f3f0c564ddf3
./training/spam/00158.58aea46256aaaa23787ec2acdcd31073
./training/spam/00159.6b641c70d79fd5a69b84a94b4e88150a
./training/spam/00160.66e11a17d619ce33d5a455a87015ee25
./training/spam/00161.37836bbc77bdab253f914d7a5233badb
./training/spam/00162.b5ae5521352c9bba7bf635c1766d4a75
./training/spam/00163.2ceade6f8b0c1c342f5f95d57ac551f5
./training/spam/00164.272880ebd1f1f93cf0cd9800842a24bd
./training/spam/00165.3a37220d69b5b8332ee4270f0121dfe9
./training/spam/00166.806d5398d7a37c080641a2d62e2d2b94
./training/spam/00167.c37f166df54b89ce15059415cfbe12c3
./training

./training/spam/00299.ec4bd0c57a7bf6a5616beb2897aaed7b
./training/spam/00300.6ca4fee75f6afbd00581ceec6cf14fac
./training/spam/00301.3b6fa92db458408d9468360fc034d280
./training/spam/00302.7f695c69d4cbe06e91cddd2ca7cddf33
./training/spam/00303.7d749e4a46ceb169ea1af5b9e5ab39a9
./training/spam/00304.fedb2050801439ca46d1d82d10e8e9e2
./training/spam/00305.1f2a712e25638b0a4868155b37022cf1
./training/spam/00306.9e12acdecaf3825733a1aadc2455b166
./training/spam/00307.79b64580c5c605583aec7b7a4f8679c0
./training/spam/00308.fc90f8aab51648329b9e705c9021b204
./training/spam/00309.514ba73d47cc5668a2afdef0a25b400c
./training/spam/00310.dd799961317914374088b04cc0fb1b85
./training/spam/00311.176626eb0ec0de8f451a079083104975
./training/spam/00312.5034fee7d265abd7a4194e9300de24bf
./training/spam/00313.4363902393c0037670bc9483d19dc551
./training/spam/00314.ce1926a9815415807e51b80930bffdb8
./training/spam/00315.81905f8867f52179286a6a5bdd1483fa
./training/spam/00316.6127940652124130611907ee0c20ab5e
./training

./training/spam/00448.65d06c45ea553e3fddf51645ae6b07f0
./training/spam/00449.c498bdd182edba9e9ba725c5a5a1f06b
./training/spam/00450.acfa2d7f64e43ef04600e30fdecff8ec
./training/spam/00451.bbff6de62f0340d64a044870dbedafba
./training/spam/00452.f13574a4582c94daf2bd6668c1683eed
./training/spam/00453.7ff1db57e1e39cb658661ef45b83a715
./training/spam/00454.ca6a81a702d62c23bc184a37a1cdb926
./training/spam/00455.8ccdcb205b6f8c3958bb3b2d39edca46
./training/spam/00456.c680a0c7d8d8d91bf3fb9f77ce6541b0
./training/spam/00457.f4325a4aa30dce61bf6c442b887733dd
./training/spam/00458.49467dba97d6ac2825c533df9a7a5017
./training/spam/00459.2c2341d0342bfedc94464025126bedc6
./training/spam/00460.407cd7d4ce577b1474eba6dd35a15081
./training/spam/00461.57e0fe5d31215393d7cf4e377e330082
./training/spam/00462.d0ca75a85184ca9843d1dffdc8c98855
./training/spam/00463.0bc4e08af0529dd773d9f10f922547db
./training/spam/00464.d2f719c667d192af860572b1c858cc11
./training/spam/00465.81b738fc646c03b1db38a456cd087ad7
./training

./training/spam/00599.d6d6a2edd58fa7dd6b18787e9867984b
./training/spam/00600.d113e8e0ac2d8d42f500f4faef61061b
./training/spam/00602.8b839b13833d7ca3abd6f6179ccc0286
./training/spam/00603.c76054a8ff1b75fb722602e567d2e7e1
./training/spam/00604.3dcf5f835dacff5d4a32a24eba31cbd2
./training/spam/00605.8a2e83e442d0052a2b2e9cff1ef0793c
./training/spam/00606.f078f6f48ae2e41f553882449f2aa613
./training/spam/00607.b5726a2ac74956ab9af0901379cdca5c
./training/spam/00608.bf7d77ef4f28552278f78a5aef71f0cf
./training/spam/00609.4dfe7912017772587dc62fecc3cf6553
./training/spam/00610.d78eda68ed03fa6890077bd4b805e16d
./training/spam/00611.58f9a1db64c83da2bfd3b1acf8d38338
./training/spam/00612.cd362b97ee34d41e72a66ed5199dd62e
./training/spam/00613.047cf0bcc74950bd9e1eaf8d336c385c
./training/spam/00614.974cdf353242f9286fbcc34673d9f28a
./training/spam/00615.e47bff6118d4ff6d98581fa6f40ab871
./training/spam/00616.1e86f2d3478f10a3a413844bba74f450
./training/spam/00617.f2097d6448725c371fd5f4154184ad3c
./training

./training/spam/00749.9887b1d7cb21083c777b0623cfdb02af
./training/spam/00750.dfc392478300e11189d61d29bed9cecc
./training/spam/00751.3158a29a29997cc16a69497399d90ca2
./training/spam/00752.c0892cd4ffff618e689dec28f2f4695e
./training/spam/00753.c3032ff8329006ec6b39b6c821185b1c
./training/spam/00754.9922dfbaee98abc6e1a3a00909a8d24e
./training/spam/00755.4280e5603d66801661cbd0fe0b33eec8
./training/spam/00756.b68f9bcfd782a01a2ece132eccdcbbe9
./training/spam/00757.c2da75286819a139e27438ca5c5ba762
./training/spam/00758.13f37fcfbc515a7f7de269852fb7b842
./training/spam/00759.23e678ecd735ad618ad151d311c81070
./training/spam/00760.254b8986f3d7b6cbda1cc7ce16860e6c
./training/spam/00761.00d729b279723c9ae9d8f09e171db301
./training/spam/00762.e31568dff471c947d42869ae2f8f0779
./training/spam/00763.868a503063713b62fd5325513ba29761
./training/spam/00764.d81e084a6940b58fa3deabed038a7b9e
./training/spam/00765.cfd85d27a812054ddd5ee1fc7d881557
./training/spam/00766.ff1f266127aebe1fd285b9a211f34723
./training

./training/spam/00898.8b3fe8deaa79f08133be78fc63e726c9
./training/spam/00899.957a4f3be27468470183f978db53c753
./training/spam/00900.a5f6355af8a1891e683898c5b549e565
./training/spam/00901.95250e8c5c190d1b0320b9e6fe0f5a82
./training/spam/00902.5cf0d5b3c8c28418fd5aac308db88a47
./training/spam/00903.1b151e48aafed20229a1880c1d558992
./training/spam/00904.8f93ecb6172ee1feba7b4248c48b9ef5
./training/spam/00905.8dcb590481d3e3c04d03506100c59497
./training/spam/00906.bd0b0986deaf717b1f1a689fd950b97c
./training/spam/00907.74983d9d0d6ee3c681a48cf893f123b5
./training/spam/00908.718f913e615b66df1c33cfb0af8e8885
./training/spam/00909.be44baf9966a96b2154b207cc56fe558
./training/spam/00910.49da3099938f292733872f1e59c8c460
./training/spam/00911.4332049ae6031c5a39ecefdc807961a4
./training/spam/00912.8432b3cd988d1ee656321730ce7ca056
./training/spam/00913.2c4aa5dfdd0ecb0331c674b73f40e924
./training/spam/00914.b4f1e9f517f85e68f8326f3a1525ebc2
./training/spam/00915.2f47eee6061e3c631bd51649050b6b02
./training

./training/spam/01047.2e01ca2cd13baa8ba6fe1429f39f85de
./training/spam/01048.a8581a98e46532472fe74772b6e3477a
./training/spam/01049.621d66148b023203d9010ee5df12ddd1
./training/spam/01050.f18a04fd3f7cf3e60483c3420bff5417
./training/spam/01051.a87f28b7d023a840cb54ed7aa5f4e19b
./training/spam/01052.223738f00502f5dd7f86dd559af8f795
./training/spam/01053.2ec9e69df620c85e3c7c6f61ce199a3c
./training/spam/01054.3b393992b2d9b3c2209719e6ac83da11
./training/spam/01055.6235123a9b08a94a2262000419edd68a
./training/spam/01056.c7e5574c3a036313b160b31234e7d81e
./training/spam/01057.b3913dec24f1b61e2ff45e30cdbb8fcd
./training/spam/01058.c5bf6182a887f342973383823dd22d0c
./training/spam/01059.3bb25dd5704c3e29081ef39c7cb0a200
./training/spam/01060.d72413ea3af9e1c5530a3570e0bb517e
./training/spam/01061.6200efa97d5fecf255e3849dde7a3701
./training/spam/01062.c35ba7728139dda6e85342e9e8cb2eaa
./training/spam/01063.e0318e91412291f881f315287050abe4
./training/spam/01064.50715ffeb13446500895836b77fcee09
./training

./training/spam/01195.2e0668d1365c631aa09d21c813ce013f
./training/spam/01196.b4567843ec0343475dfcb5da7bd9c41f
./training/spam/01197.f88c038612948f3fa023ac83db2e2ce5
./training/spam/01198.da6821edae608ba753c85e1a7436219b
./training/spam/01199.5a219c9b3e55a0136d8039fd74675570
./training/spam/01200.6d388843b6ffefafaf7b3093ca28a740
./training/spam/01201.471d379a63806032b2c3978838b83e61
./training/spam/01202.4ec06d178a19d7972daf54bc3ba958ff
./training/spam/01203.f4c9144a594fb81ff323053fbd626a55
./training/spam/01204.75323a3e0d38fe7a107bd0102daf6f26
./training/spam/01205.47d139ac094945ae2630efb896dc4b43
./training/spam/01206.b769233047fc696cc02ece9629190b42
./training/spam/01207.c24e5ee957e060ad511b5d2b875ff222
./training/spam/01208.6e10d44b4339eacb5db69d1afabae907
./training/spam/01209.01df2f8f68a70062085ef787973f9ba0
./training/spam/01210.3315bbc34ab0c51f53ee13e0dc6ccaec
./training/spam/01211.e9c2c3b1d544e8618d6f87c1871021e9
./training/spam/01212.216774fff566f005d1ef404eda7925e2
./training

./training/spam/01343.97e31a95126f0c6dc249a8e51489af10
./training/spam/01344.43dbbbac0d006790dd696123d455f5b9
./training/spam/01345.436954c32bbf82773e33853ac26ef881
./training/spam/01346.fb942e99ad6211fe374675bc9ac639d5
./training/spam/01347.e2cd456cd2d58601fec5a5b6323463e1
./training/spam/01348.0ed90bb4a1ba1ea2309ffdbbce093753
./training/spam/01349.71c7c224e68df0a19c1834065368c89f
./training/spam/01350.31d85a79d8c3b80069316daf56176820
./training/spam/01351.e960056da1502c90d40d598cdf37f359
./training/spam/01352.875dff8a1fd32766be05e136950add70
./training/spam/01353.369f79f8f31f3b18bdb5d1006207b52e
./training/spam/01354.942feb599d3e244a238c28c9028d97fa
./training/spam/01355.a47c042a6e16456c5b49c18d5b3868cb
./training/spam/01356.8d996c0bc08a47a90611de2e8a829048
./training/spam/01357.531b966b657f95f04a9037ea7100aa9f
./training/spam/01358.eb6c715f631ee3d22b135adb4dc4e67d
./training/spam/01359.deafa1d42658c6624c6809a446b7f369
./training/spam/01360.5bf908ff3e674f31061afcb8a6c17a8d
./training

./training/ham/0095.b51416f612ac5737e0f4a5529ce453d1
./training/ham/0096.0446f3ed63b550a8622c8671d8ae9a9c
./training/ham/0097.39badf2fea6bcebc640bea05ced59b59
./training/ham/0098.5053f669dda8f920e5300ed327cdd986
./training/ham/0099.9f54be08406e67fd8944f2f1d0fbdd90
./training/ham/0100.1728f45047ff2a1601d4e3ee91f26a00
./training/ham/0101.48557f7f38d947eab77aefc03d0520a3
./training/ham/0102.ba8679813c4b5f424fb225f09b2fb1f2
./training/ham/0103.3fc5444196f4726ee138fbabc5086ea1
./training/ham/0104.cbb1681451b2525d7aeec9eeb285306d
./training/ham/0105.be508e1b909bae328d4a13fe898f60fb
./training/ham/0106.808c5e5ed0b801f667a34ead8221972e
./training/ham/0107.2447a90f32ab7642ff8309d41c242db2
./training/ham/0108.e6e9cb097a3b5e37d94a7ff29bc4412a
./training/ham/0109.4ccc46c546b93015aafdfc40495f187d
./training/ham/0110.0bb9a36c3037be09867c0251e0fd6a3a
./training/ham/0111.e10679164e671fd9211c0303af7ee9f0
./training/ham/0112.b6c2ea75f9f7efcd3d0e4c43a751479c
./training/ham/0113.55a6bf6a4534d447af2060b174

./training/ham/0250.21660c434f8535865d04d112f8105708
./training/ham/0251.f9d83ac9ced9eab63df7da2bccb83bc8
./training/ham/0252.274e70cd31fb5c72b9e7aee909c53cb1
./training/ham/0253.f8c6edd67ecdabe677a8f5cd082e3fc5
./training/ham/0254.d40c629a02c7361e674d0d96ca130fe0
./training/ham/0255.98492176173fd6951086bb7af9e0008a
./training/ham/0256.6162c21da16ce179d6ed0238a7413a8d
./training/ham/0257.d09e74208e9cb40c0eacbb77afa80e74
./training/ham/0258.f47b7f8872a17204171574c20fb546ab
./training/ham/0259.434a3208757e9738f7af6a004f42c5f1
./training/ham/0260.b68400a28ee29cb2f24149a03db1fd9e
./training/ham/0261.45da0560dfe8183b3433e2cf2f3a7835
./training/ham/0262.4b1558b6e0d2e4edd9e8bed23afb9d40
./training/ham/0263.ffa4db454754b3c66fd025e92912941e
./training/ham/0264.a1183a59e4f0a71e80378d9404a3212f
./training/ham/0265.84c034d3d76a3d069732c02e1101fe23
./training/ham/0266.387a672eb91910a913cde6ebdf33ef05
./training/ham/0267.218a528c448166d39f853cf75d8890dd
./training/ham/0268.77ef28e27a9ee085646f260418

./training/ham/0405.ef8e0986c113ddf12faa7c1a05ace625
./training/ham/0406.9f6672caace4903c3d6180f2e95a3153
./training/ham/0407.3436b9e83d36200b38a97b7fa179b6aa
./training/ham/0408.3e476c9b90944c3b5fccdb2055bce897
./training/ham/0409.7fdf4f0f8aad0b0ad4654b10afae9225
./training/ham/0410.a5e658e20b48409116fd339f5a8473f7
./training/ham/0411.315b5c32101916ae2192760585b71763
./training/ham/0412.a141da0849f5295c2a6230fd3ed9647c
./training/ham/0413.c4ecec1b9ca5e6bd1fd9bc7fa6c29b2d
./training/ham/0414.06408c603660212989180dd086b0f460
./training/ham/0415.6dfea45b8aef7d6ef6a462974d61ce3e
./training/ham/0416.6071df16fc9f7c45f65163f5b10a16d2
./training/ham/0417.74bf147587a49c58a67490c0211f7928
./training/ham/0418.c0b6aa880f0653a8cef85a9bfb1af86c
./training/ham/0419.baeaada19ebce19874d17d5ef73ced0d
./training/ham/0420.be62f32c2be92df5f22deacf5c399407
./training/ham/0421.4df8b9eebae5cbf2d5213e2040d21c9e
./training/ham/0422.cf8753c2ab03fedf1c9a2d8fb1c1578f
./training/ham/0423.353c3de8f5c7114179771f9b44

./training/ham/0560.c422e330f8ff3ecf8fdfb5f00e691bf9
./training/ham/0561.7e0c08933efe38c91d293454976e4977
./training/ham/0562.7dea96e851b897ea041ea242a9d3c119
./training/ham/0563.cc76540c47ac49aa8dcbe801e8380b36
./training/ham/0564.d200fee3d675d75d79f14c8308d631d1
./training/ham/0565.881770e64b20fa8ee779d12aeda1fc54
./training/ham/0566.97120c2bc957702e0e1606025799951e
./training/ham/0567.41bb1a0c6c8c5584844c6baa2612ede2
./training/ham/0568.112e1928cfa88de3901d08715fd2e3b3
./training/ham/0569.ef10ce803ff212d76f9dd6148b84a823
./training/ham/0570.17bb9eeb6562425ac2aee464a84f0e4a
./training/ham/0571.d336e43297da0069689d70dec6ff7870
./training/ham/0572.ff4614fcac266492c9c3c1a9432cbe89
./training/ham/0573.afc1bc63378e7961549e4bf789461719
./training/ham/0574.2059f95c47876cb8f6db440751b0cfc9
./training/ham/0575.2f6d5a053932e0900b329a15c5f78e38
./training/ham/0576.f32636c8fb4d4f4fde3c891f9fdee39c
./training/ham/0577.7297b06fde913833e96aca4385826c29
./training/ham/0578.a91b2f2cc20b7a68d5e340cac0

./training/ham/0715.6209671b4aec7159b71320b0c462ce8e
./training/ham/0716.ce0270837e944df9396a45d6389d2225
./training/ham/0717.211b19b47bd1c85001dddbe5768d79a1
./training/ham/0718.68f8a85adbdadbd8959876992d001c5c
./training/ham/0719.078bfec7d10d50405c5213fdcfe069f4
./training/ham/0720.9b70b163d69018e7aee73cf86459265d
./training/ham/0721.9798746ace52b039545cfcb5dd5df9fa
./training/ham/0722.8aba37e84c5a58cdfe72fc9ed03089ae
./training/ham/0723.e326ddd50487b33418a053b67df5ab5d
./training/ham/0724.789e078e8d5286dcb7da0df2f47e6886
./training/ham/0725.2c1c01215a3c8c4936f8cfea181092d1
./training/ham/0726.b9cf1bbdedddc254ac0a78e9fdb22a8e
./training/ham/0727.a398f36f56d924dc775138b61dffbfb2
./training/ham/0728.7d37495ed88e5edd68f2c868e451b68e
./training/ham/0729.70d1cec4f8f949fc7ef64fc3ed85f950
./training/ham/0730.9570ee3b6bf144198297b23bca5044e9
./training/ham/0731.59e8a707586a8b3cfe89bff4024dead7
./training/ham/0732.63667434e8712ed16361596f40c468ed
./training/ham/0733.782a236e90e0e7a55b3c67be6f

./training/ham/0870.e7254d91187938f2f0b2fa1ff117f822
./training/ham/0871.79be1926ade2b8fc591f9f51abf66224
./training/ham/0872.b39e14fc64a28f30b5c413d51ed9dea1
./training/ham/0873.bd6a6b2911a0dedccee495aaf0fb248d
./training/ham/0874.f17f5355a2abf8cb83fb09069c9460bc
./training/ham/0875.990a5210c2586a027e13b10bf9d3e4ae
./training/ham/0876.22140b6b06918d04fc07c3d992a6b846
./training/ham/0877.62f5636ba5885d1b92423169c83a35b9
./training/ham/0878.24186e5267a8ec179a2e07f2da013932
./training/ham/0879.267af96ab014056d029ea42fd1ecf2bd
./training/ham/0880.6aa461d079bb0762af5c94d9af8487ec
./training/ham/0881.316c03b1dbd637537a4035f8470c6c12
./training/ham/0882.1fc35ed593366d26e06112250d18678a
./training/ham/0883.1c07a9bc574c386fbd893edbb24ea4e6
./training/ham/0884.f4a7181c5337229d1e70c587cbae9567
./training/ham/0885.edd07a1946446122321ba10a01eda39a
./training/ham/0886.a901020854d0c42772ba10e45212ee82
./training/ham/0887.95cafbddb7fce33aada0e9d9bb329aa5
./training/ham/0888.a8cf27198dd5b8612b47fe3f8f

./training/ham/1025.77dcf4ff92802ab3af94b9516d25b02a
./training/ham/1026.b2ee7b3cb90365641e465cfede58a672
./training/ham/1027.35033a975b9979f1a2eb34db590b32ac
./training/ham/1028.a139c4ec44f9dd8286a8333d934ced4e
./training/ham/1029.cfdcc9c1ab54e77b18da7164b324178a
./training/ham/1030.9a38ec315e2e8926085a560e36f977b8
./training/ham/1031.cfbae64b0894abd4ef88ddbf253fa704
./training/ham/1032.725446990feeb941bce8a383943cd2a2
./training/ham/1033.4155e06c0fb104b96ec6c5d656264fa7
./training/ham/1034.30d39e880274b0ebb59462ddfff16880
./training/ham/1035.a3cdb2fe04945379483b12640bdb19d4
./training/ham/1036.8c6e8c189738f671c5fd1cb2b1791317
./training/ham/1037.1285e23706448e51cb9be399cd0546ed
./training/ham/1038.4a0af891d4474c608eb6a3ffedeb8ada
./training/ham/1039.28e0f6b0f1e09bbcebc01183bd3557c4
./training/ham/1040.0975203bad39968cfceadfcd68a9242b
./training/ham/1041.c96194d26968c693aca4f4ef5f4a6a61
./training/ham/1042.5089423fc172a53a687d43ad6b432caf
./training/ham/1043.701d8a6b948c0e1406a85fc621

./training/ham/1180.9d441ee49eef8ba99c54ff8ec4ea9096
./training/ham/1181.e94c4c98963c99b42be08be341418d57
./training/ham/1182.3156d66c268869ee74f080a7c7523be8
./training/ham/1183.54b599ea32d753071ca44e55db7fe85d
./training/ham/1184.152999961bd0606fb0dcb6d6f95cc96d
./training/ham/1185.b83ac460b08616796da67d3bc57b76df
./training/ham/1186.fcd93eee19319006fece61d0d3a0bc13
./training/ham/1187.e5fee75dd86128e81186369567522f61
./training/ham/1188.1f6d5c1960b4d2d50e9745073559284e
./training/ham/1189.e69e458af7eefebc70d658dd2e6d23df
./training/ham/1190.99eaa82bada000ca0ada9512a3f590e2
./training/ham/1191.f8b145ebabbd42450c292e448cf44f15
./training/ham/1192.1d7f09a0119d74789f1918f9c02beb45
./training/ham/1193.8805a7218e81db9893ee2b704d9ebeb1
./training/ham/1194.ee628dd00ae3eea31232d0e78a39c9b2
./training/ham/1195.4cad51ea61eee58c9b4496ebba828692
./training/ham/1196.38c6bb13ea04559ae8d2cc19219d1ac5
./training/ham/1197.d2b11de4c8eb13e1fcc1d444665d0d1c
./training/ham/1198.b0a1f43fd2c1b88883e597b644

./training/ham/1335.57112a05c5a5ba633707ed3fdecfd88c
./training/ham/1336.7ea005648426a4f5815e0bb42230a595
./training/ham/1337.eb38bf81b8a24991252e451208c24ecd
./training/ham/1338.6656824f5801ca79629b3374e834ad04
./training/ham/1339.57de97732c190bf4575276eb054e0e79
./training/ham/1340.763507dad3b921f972640d4d76172f7e
./training/ham/1341.91bc30d50566e71807217c8977f7a793
./training/ham/1342.6770473802a7dc7ab14a212f95573c15
./training/ham/1343.c5ead41a4483bd3732e42d52c9163d24
./training/ham/1344.c12652263fc21b75dd4298a3d1bfcf53
./training/ham/1345.df3d6d8a5d9e6bba6484fbef2dae9102
./training/ham/1346.e172cd7f334b27ed0aeb288ca6436f7d
./training/ham/1347.b74ed7bb44086b2b4721bce965ac8273
./training/ham/1348.79cf2ab14db2301670710f9d1feaddda
./training/ham/1349.d05ca7b269c99915562d3e448f7b8afa
./training/ham/1350.e386afc23b51f446e2f6779c1ade0a68
./training/ham/1351.c176f7e5d80492c34cad40bc9e939012
./training/ham/1352.dde8603b55106a1e7fe9508e972b24d2
./training/ham/1353.bd9fa1be6b135e4aea436dc795

./training/ham/1490.e80cf700ef4de2385ada6ae149e7498c
./training/ham/1491.2e82a3803e51f420c6398d963052469f
./training/ham/1492.fcfeff3d31bfad092220ca731a40c05e
./training/ham/1493.0e19ac93ca8fe935323151417bc2e959
./training/ham/1494.adcdefb394b06836cbb548c4a93ee76a
./training/ham/1495.965bf18408e31d93d3fd256d2c6e8b64
./training/ham/1496.c48399a0c3abb60acd00322bdbb97565
./training/ham/1497.82b5d123cc0c3630a6ecd3d0eefe7c08
./training/ham/1498.578d31b0aab137aa9d79b455a6def5cd
./training/ham/1499.37d7a1b89dcf94a9b123ad584c2fa149
./training/ham/1500.2e6e497d8947d6125050b838efe4cf1f
./training/ham/1501.5c83f8ccf80ccff0b82b04a8e09d69ec
./training/ham/1502.1376704c3bd9c110fe3a5d0768745d91
./training/ham/1503.f1fc48f16902a097113fa60aaeb35245
./training/ham/1504.f77b2dc9ad8c875d8edc67b180e2f878
./training/ham/1505.7cc0d1e500937105c1503d63bd0b5161
./training/ham/1506.df1d0b609d034c834e290e7e3732e392
./training/ham/1507.381db954fc6fe8df9646172b53db92ad
./training/ham/1508.334b2eb6c70ba66605ebefc979

In [20]:
import time

# dumping vars to files (processing takes a long time, so it's better to do it once and save the vars into a file for later use)
f = open('./vars/corpus.pckl', 'rb')
corpus = pickle.load(f)
f.close()

f = open('./vars/vocab.pckl', 'rb')
vocab = pickle.load(f)
f.close()

f = open('./vars/labels.pckl', 'rb')
labels = pickle.load(f)
f.close()

#vectorization tf-idf style
vectorizer = TfidfVectorizer(vocabulary=vocab)
tfidf = vectorizer.fit_transform(corpus)
#print(tfidf.toarray())
print(len(vectorizer.get_feature_names()))
#print(vectorizer.idf_)


#SVM model
svr_rbf = SVR(kernel='rbf',C=10, gamma = 0.1)
start = time.time()
svr_rbf.fit(tfidf.toarray(), labels) 
end = time.time()
print('done in %.2f s' % (end - start))

12687
done in 65.45 s


In [21]:
# Cross-validation
ham_cv_corpus = []
spam_cv_corpus = []
ham_cv_path = './cross_validation/ham/'
spam_cv_path = './cross_validation/spam/'


# Testings
ham_test_corpus = []
spam_test_corpus = []
ham_test_path = './testing/ham/'
spam_test_path = './testing/spam/'


# read ham cross-validation data
i = 0
for filename in os.listdir(ham_cv_path):
    #i = i+1
    #if(i>302):
    #    break
    em = read_unlabeled_email((ham_cv_path+filename))
    if(em is None):
        continue
    ham_cv_corpus.append(em)

    
# read spam cross-validation data
i = 0
for filename in os.listdir(spam_cv_path):
    #i = i+1
    #if(i>302):
    #    break
    em = read_unlabeled_email((spam_cv_path+filename))
    if(em is None):
        continue
    spam_cv_corpus.append(em)

    
spam_cv_tfidf = vectorizer.fit_transform(spam_cv_corpus)
ham_cv_tfidf = vectorizer.fit_transform(ham_cv_corpus)

pred = svr_rbf.predict(spam_cv_tfidf.toarray())
print('performance on spam cross-validation set {0:.0%}'.format(sum(num >= 0.5 for num in pred)/len(pred)))

pred = svr_rbf.predict(ham_cv_tfidf.toarray())
print('performance on ham cross-validation set {0:.0%}'.format(sum(num < 0.5 for num in pred)/len(pred)))





performance on spam cross-validation set 97%
performance on ham cross-validation set 85%


In [22]:
# read ham test data
i = 0
for filename in os.listdir(ham_test_path):
    #i = i+1
    #if(i>302):
    #    break
    em = read_unlabeled_email((ham_test_path+filename))
    if(em is None):
        continue
    ham_test_corpus.append(em)

    
# read spam test data
i = 0
for filename in os.listdir(spam_test_path):
    #i = i+1
    #if(i>302):
    #    break
    em = read_unlabeled_email((spam_test_path+filename))
    if(em is None):
        continue
    spam_test_corpus.append(em)

    
spam_test_tfidf = vectorizer.fit_transform(spam_test_corpus)
ham_test_tfidf = vectorizer.fit_transform(ham_test_corpus)

pred = svr_rbf.predict(spam_test_tfidf.toarray())
print('performance on spam test set {0:.0%}'.format(sum(num >= 0.5 for num in pred)/len(pred)))

pred = svr_rbf.predict(ham_test_tfidf.toarray())
print('performance on ham test set {0:.0%}'.format(sum(num < 0.5 for num in pred)/len(pred)))



performance on spam test set 97%
performance on ham test set 93%
