In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *

In [51]:
df = pd.read_csv('data.csv')

df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [52]:
df.drop_duplicates(inplace = True)
df.shape

(5695, 2)

In [53]:
pd.DataFrame(df.isnull().sum())

Unnamed: 0,0
text,0
spam,0


In [54]:
df = df.sample(frac=1).reset_index(drop=True)

In [55]:
df.head()


Unnamed: 0,text,spam
0,Subject: met office presentation . . . vince ...,0
1,Subject: marketpoint business plan summary vi...,0
2,"Subject: yen outlook vince , as a followup t...",0
3,Subject: enron projects by team enron tiger t...,0
4,Subject: mg metals : quant analysis & risk hi...,0


In [56]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [57]:
lmtzr = WordNetLemmatizer()
with open('english.txt', 'r') as f:
  text = f.readlines()
eng_words_ubuntu = set([lmtzr.lemmatize(x.strip().lower().replace('\'s', '')) for x in text] )
stopwords = set(stopwords.words('english'))
words = eng_words_ubuntu.difference(stopwords)
print(len(words))


56622


In [59]:
tr = re.sub('[^a-zA-Z0-9]+', ' ',df['text'][0])
tr = tr.lower().split()
tr = ' '.join(list(filter(lambda text:text in words,tr)))


In [60]:
tr

'subject met office presentation vince keep informed european effort steve good job taking bull asked rapidly build client base associated support system looking good mike forwarded mike enron north america corp stephen bennett enron annette harris lon tony hamilton enron enron mike jose marquez corp enron enron subject met office presentation annette wanted drop quick line thank invitation met presentation today tony currently trying get grasp require way weather information building support structure need close met office well data information presented today helpful like take little time sit whose weather driven like get feel data already streaming get idea utilize supplement data europe model created houston start want make sure tailor would like take time sit chat perhaps tony take others lunch afternoon coffee thanks help stephen bennett senior meteorologist enron research london april tony hamilton meteorology manager enron research'

In [61]:
df['text'] = df['text'].map(lambda text:re.sub('[^a-zA-Z0-9]+', ' ',text)).apply(lambda x: (x.lower()).split())
df['text']= df['text'].apply(lambda text_list:' '.join(list(filter(lambda text:text in words,text_list))))

In [62]:
df.text[0]

'subject met office presentation vince keep informed european effort steve good job taking bull asked rapidly build client base associated support system looking good mike forwarded mike enron north america corp stephen bennett enron annette harris lon tony hamilton enron enron mike jose marquez corp enron enron subject met office presentation annette wanted drop quick line thank invitation met presentation today tony currently trying get grasp require way weather information building support structure need close met office well data information presented today helpful like take little time sit whose weather driven like get feel data already streaming get idea utilize supplement data europe model created houston start want make sure tailor would like take time sit chat perhaps tony take others lunch afternoon coffee thanks help stephen bennett senior meteorologist enron research london april tony hamilton meteorology manager enron research'

In [63]:
df.to_csv('spam.csv', index=False)

In [64]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(df['text'],df['spam'], test_size = 0.20, shuffle=True)

In [109]:
def perform(classifiers, vectorizers):
    max_score = 0
    max_name = 0
    for classifier in classifiers:
        for vectorizer in vectorizers:
        
            # train
            vectorize_text = vectorizer.fit_transform(xtrain)
            classifier.fit(vectorize_text, ytrain)
    
            # score
            vectorize_text = vectorizer.transform(xtest)
            score = classifier.score(vectorize_text, ytest)
            name = classifier.__class__.__name__ + '_with_' + vectorizer.__class__.__name__ 
            print(name, score)
            filename = f'models/{name}.pkl'
            pickle.dump(classifier, open(filename, 'wb'))
            
        if score > max_score:
            max_score = score
            max_name = name

    print ('===========================================')
    print ('===========================================')
    print (max_name, max_score)
    print ('===========================================')
    print ('===========================================')

list of various classifiers we are going to use

In [110]:
classifiers = [
        BernoulliNB(),
        MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        DecisionTreeClassifier(),
        CalibratedClassifierCV(),
        PassiveAggressiveClassifier(),
        RidgeClassifier(),
        RidgeClassifierCV(),
        SGDClassifier(),
        OneVsRestClassifier(SVC(kernel='linear')),
        OneVsRestClassifier(LogisticRegression()),
        KNeighborsClassifier(),
        LogisticRegression()
        
]

list of various vectorizers we are going to use

In [111]:
vectorizers = [
        CountVectorizer(),
        TfidfVectorizer()
    ]

perform classification and save results to a new dataframe

In [112]:
perform(
    classifiers,
    vectorizers
)

BernoulliNB_with_CountVectorizer 0.9885864793678666
BernoulliNB_with_TfidfVectorizer 0.9885864793678666
MultinomialNB_with_CountVectorizer 0.9868305531167691
MultinomialNB_with_TfidfVectorizer 0.9332748024582967
RandomForestClassifier_with_CountVectorizer 0.9754170324846356
RandomForestClassifier_with_TfidfVectorizer 0.9736611062335382
AdaBoostClassifier_with_CountVectorizer 0.9683933274802459
AdaBoostClassifier_with_TfidfVectorizer 0.9683933274802459
BaggingClassifier_with_CountVectorizer 0.9789288849868305
BaggingClassifier_with_TfidfVectorizer 0.9701492537313433




ExtraTreesClassifier_with_CountVectorizer 0.9569798068481123
ExtraTreesClassifier_with_TfidfVectorizer 0.9561018437225637
GradientBoostingClassifier_with_CountVectorizer 0.9604916593503073
GradientBoostingClassifier_with_TfidfVectorizer 0.9596136962247586
DecisionTreeClassifier_with_CountVectorizer 0.9604916593503073
DecisionTreeClassifier_with_TfidfVectorizer 0.9587357330992098




CalibratedClassifierCV_with_CountVectorizer 0.9841966637401229




CalibratedClassifierCV_with_TfidfVectorizer 0.990342405618964




PassiveAggressiveClassifier_with_CountVectorizer 0.9850746268656716




PassiveAggressiveClassifier_with_TfidfVectorizer 0.9920983318700615
RidgeClassifier_with_CountVectorizer 0.9420544337137841
RidgeClassifier_with_TfidfVectorizer 0.9868305531167691
RidgeClassifierCV_with_CountVectorizer 0.9561018437225637
RidgeClassifierCV_with_TfidfVectorizer 0.9859525899912204




SGDClassifier_with_CountVectorizer 0.9798068481123793




SGDClassifier_with_TfidfVectorizer 0.9885864793678666
OneVsRestClassifier_with_CountVectorizer 0.9850746268656716
OneVsRestClassifier_with_TfidfVectorizer 0.9894644424934153




OneVsRestClassifier_with_CountVectorizer 0.990342405618964




OneVsRestClassifier_with_TfidfVectorizer 0.9824407374890255
KNeighborsClassifier_with_CountVectorizer 0.9113257243195786
KNeighborsClassifier_with_TfidfVectorizer 0.9771729587357331




LogisticRegression_with_CountVectorizer 0.990342405618964




LogisticRegression_with_TfidfVectorizer 0.9824407374890255
PassiveAggressiveClassifier_with_TfidfVectorizer 0.9920983318700615


In [119]:
tfidf = TfidfVectorizer()
tfidf = tfidf.fit(xtrain)
pickle.dump(tfidf, open('vectors/TfidfVectorizer.pkl', "wb"))

In [123]:
model = pickle.load(open('models/PassiveAggressiveClassifier_with_TfidfVectorizer.pkl', 'rb'))
vectorizer = pickle.load(open('vectors/TfidfVectorizer.pkl', 'rb'))

In [153]:
SMS = ' won a 1 week FREE membership in our $100,000 Prize Jackpot! Txt the word: C'
ham = "Subject: rabi de phone interview  shirley ,  let ' s act on it .  vince  - - - - - - - - - - - - - - - - - - - - - - forwarded by vince j kaminski / hou / ect on 07 / 07 / 2000  05 : 07 pm - - - - - - - - - - - - - - - - - - - - - - - - - - -  zimin lu  07 / 07 / 2000 01 : 51 pm  to : vince j kaminski / hou / ect @ ect  cc :  subject : rabi de phone interview  vince ,  we had phone interview with rabi de . my impression is good . we should invite  him for a formal interview .  he is a hands on person with wide range of experience ( energy financing ,  derivatives trading , hedging , etc ) .  he communicates very well and expressed interest in financial engineering &  modeling .  zimin"

tr = re.sub('[^a-zA-Z0-9]+', ' ',ham)
tr = tr.lower().split()
tr = ' '.join(list(filter(lambda text:text in words,tr)))

vectorize_message = vectorizer.transform([tr])
predict = model.predict(vectorize_message)[0]
print(vectorize_message)

  (0, 13453)	0.17629589784128108
  (0, 13399)	0.10442211172613461
  (0, 13188)	0.24061493978879636
  (0, 12544)	0.1222819769102806
  (0, 11855)	0.06992548671492486
  (0, 11120)	0.10703465189454263
  (0, 9822)	0.17586157246271858
  (0, 9036)	0.32898835604396726
  (0, 8977)	0.13187704191438276
  (0, 7912)	0.15748754002160964
  (0, 7080)	0.08542845836512303
  (0, 6517)	0.15294675088134327
  (0, 6457)	0.5057399007767589
  (0, 6417)	0.1204653218931254
  (0, 6091)	0.19135725037940082
  (0, 5678)	0.18355553966292937
  (0, 5347)	0.10199128984109639
  (0, 5009)	0.09075306283213497
  (0, 4971)	0.19343485617874404
  (0, 4803)	0.20621233244690734
  (0, 4801)	0.12540049904064163
  (0, 4566)	0.18753934355961221
  (0, 4514)	0.14222261926148544
  (0, 4200)	0.15317159244952122
  (0, 4187)	0.10776560695033563
  (0, 2360)	0.29109307095689124
  (0, 146)	0.16330501657674615


In [154]:
if predict == 0:
    print ('ham')
else:
    print ('spam')

ham


In [155]:
predict

0

In [156]:
tr

'subject phone interview shirley let act vince forwarded vince j vince j subject phone interview vince phone interview impression good invite formal interview person wide range experience energy financing trading hedging communicates well expressed interest financial engineering modeling'