In [214]:
import pandas as pd
import numpy as np

In [150]:
import spacy

from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load("en_core_web_sm")


In [151]:
def preprocess(text):
    doc = nlp(text)
    
    no_stop_words = [token.text for token in doc if not token.is_stop]
    return " ".join(no_stop_words)            


In [152]:
df=pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [153]:
df["Message"] = df.Message.apply(preprocess)

In [154]:
df.Category.value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [155]:
df['spam']=df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [156]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"jurong point , crazy .. Available bugis n grea...",0
1,ham,Ok lar ... Joking wif u oni ...,0
2,spam,Free entry 2 wkly comp win FA Cup final tkts 2...,1
3,ham,U dun early hor ... U c ...,0
4,ham,"Nah think goes usf , lives",0


In [157]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)


In [158]:
X_test.shape

(1115,)

In [159]:
X_train.shape

(4457,)

In [160]:
y_train.shape

(4457,)

In [161]:
X_train[:10]

4028                        Yes , princess . going moan ?
793     Y?WHERE U DOGBREATH ? SOUNDING LIKE JAN C THAT...
1616                                           Mm food da
4488    Miss miss khelate kintu opponenter miss dhorte...
918     people msgs , think Iam addicted msging ... wr...
2241                                              K close
5057                   Goodnight da thangam miss u dear .
4820                        Got smaller capacity ? ex ...
2218                          * cartons u pleased shelves
3350                                         Oh ! brand ?
Name: Message, dtype: object

In [162]:
y_train[:10]

4028    0
793     0
1616    0
4488    0
918     0
2241    0
5057    0
4820    0
2218    0
3350    0
Name: spam, dtype: int64

In [163]:
X_test[:10]

2639                                    gobi arts college
1415                  wats ur opinion abt abt character ?
4132    Ur luck Love . Ur fortune Love Loves U. , mira...
1342    chill 6hrs . sleep pain surgical emergency unf...
4166    Dear Voucher Holder , claim weeks offer , PC h...
1302                  tot u reach liao . said t - shirt .
5156    Sir , need Velusamy sir date birth company ban...
540                         tirupur da , started office .
3524                                    Try neva mate ! !
589                                Ya srsly better yi tho
Name: Message, dtype: object

In [164]:
y_test[:10]


2639    0
1415    0
4132    0
1342    0
4166    1
1302    0
5156    0
540     0
3524    0
589     0
Name: spam, dtype: int64

In [165]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train)


In [166]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [168]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       972
           1       0.96      0.93      0.94       143

    accuracy                           0.99      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [169]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

In [170]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])



In [171]:
clf.fit(X_train, y_train)

In [172]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99       972
           1       0.96      0.93      0.94       143

    accuracy                           0.99      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [173]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [174]:
email_ham=["reward money click here"]
email_spam_count=v.transform(email_ham)
model.predict(email_spam_count)

array([1])

In [175]:
email_ham=["How are you how is your study's going on"]
email_spam_count=v.transform(email_ham)
model.predict(email_spam_count)

array([0])

In [176]:
email_ham=["Congratulations you have won the lottery click this link below"]
email_spam_count=v.transform(email_ham)
model.predict(email_spam_count)

array([1])

In [177]:
from sklearn.metrics import confusion_matrix


In [178]:
confusion_matrix(y_test, y_pred)

array([[966,   6],
       [ 10, 133]])

In [179]:
import string
string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [180]:
x_train_punc= X_train.apply(lambda x:remove_punctuation(x))
x_test_punc= X_test.apply(lambda x:remove_punctuation(x))

In [181]:
x_train_punc.tail(4439)


473      meant money enters account  bank remove flat ...
5129    Rose red  red blood  blood heart  heart u u  S...
1100    ne thing interesting  good birthday  u wrking ...
4527    want cock  hubby away  need real man 2 satisfy...
895          g class early tomorrow trying smoke    ltgt 
                              ...                        
344                                      interested like 
4227    Ok s cool   raglan rd edward rd  cricket groun...
2262                                      wot u c 4 dust 
5543                           U nt got urself jacket ah 
5484          picking points  going 2 yeovil  motor pr...
Name: Message, Length: 4439, dtype: object

In [182]:
X_test_cv = v.transform(X_test)def remove_punctuation(text):
  if(type(text)==float):
    return text
  ans=""  
  for i in text:     
    if i not in string.punctuation:
      ans+=i    
  return ans

In [183]:
x_test_punc.head()

2639                                    gobi arts college
1415                   wats ur opinion abt abt character 
4132    Ur luck Love  Ur fortune Love Loves U  miracle...
1342    chill 6hrs  sleep pain surgical emergency unfo...
4166    Dear Voucher Holder  claim weeks offer  PC htt...
Name: Message, dtype: object

In [184]:
X_train.tail()

344                                     interested like .
4227    Ok s cool . , raglan rd edward rd . cricket gr...
2262                                     wot u c 4 dust ?
5543                          U nt got urself jacket ah ?
5484    , ,     picking points | going 2 yeovil | moto...
Name: Message, dtype: object

In [185]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv_punc = v.fit_transform(x_train_punc.values)

In [186]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv_punc, y_train)

In [187]:
X_test_cv_punc = v.transform(X_test)

In [188]:
from sklearn.metrics import classification_report

y_pred_punc = model.predict(X_test_cv_punc)

print(classification_report(y_test, y_pred_punc))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       972
           1       0.94      0.93      0.94       143

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [189]:
confusion_matrix(y_test, y_pred_punc)

array([[964,   8],
       [ 10, 133]])

In [190]:
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range = (1, 2))),        
     ('Multi NB', MultinomialNB())         
])

clf.fit(X_train, y_train)


y_pred_2_gram = clf.predict(X_test)


print(classification_report(y_test, y_pred_2_gram))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       972
           1       0.97      0.92      0.95       143

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [191]:
confusion_matrix(y_test, y_pred_2_gram)

array([[968,   4],
       [ 11, 132]])

In [192]:
clf = Pipeline([
    ('vectorizer_1_3_gram', CountVectorizer(ngram_range = (1, 3))),       
     ('Multi NB', MultinomialNB())         
])


clf.fit(X_train, y_train)


y_pred_3_gram = clf.predict(X_test)


print(classification_report(y_test, y_pred_3_gram))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       972
           1       0.98      0.92      0.95       143

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [193]:
confusion_matrix(y_test, y_pred_3_gram)

array([[969,   3],
       [ 11, 132]])

In [194]:
clf = Pipeline([
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range = (1, 3))),        #using the ngram_range parameter 
     ('Multi NB', MultinomialNB())         
])

clf.fit(x_train_punc, y_train)


y_pred_3_gram_punc = clf.predict(X_test)


print(classification_report(y_test, y_pred_3_gram_punc))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       972
           1       0.97      0.92      0.95       143

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [195]:
confusion_matrix(y_test, y_pred_3_gram_punc)

array([[968,   4],
       [ 11, 132]])

In [196]:
df_2=pd.read_csv("./datasets/TREC_07.csv")

In [197]:
df_2.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Tomas Jacobs <RickyAmes@aol.com>,the00@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 21:00:48 +0300","Generic Cialis, branded quality@",\n\n\n\n\n\n\nDo you feel the pressure to perf...,1,0
1,Yan Morin <yan.morin@savoirfairelinux.com>,debian-mirrors@lists.debian.org,"Sun, 08 Apr 2007 12:52:30 -0400",Typo in /debian/README,"Hi, i've just updated from the gulus and I che...",0,1
2,Sheila Crenshaw <7stocknews@tractionmarketing....,the00@plg.uwaterloo.ca,"Sun, 08 Apr 2007 17:12:19 +0000",authentic viagra,Mega authenticV I A G R A $ DISCOUNT priceC...,1,1
3,Stormy Dempsey <vqucsmdfgvsg@ruraltek.com>,opt4@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 17:15:47 -0100",Nice talking with ya,"\nHey Billy, \n\nit was really fun going out t...",1,1
4,"""Christi T. Jernigan"" <dcube@totalink.net>",ktwarwic@speedy.uwaterloo.ca,"Sun, 08 Apr 2007 19:19:07 +0200",or trembling; stomach cramps; trouble in sleep...,"\nsystem"" of the home. It will have the capab...",1,0


In [198]:
new_test=df_2.label

In [199]:
new_test

0        1
1        0
2        1
3        1
4        1
        ..
53752    1
53753    1
53754    0
53755    0
53756    1
Name: label, Length: 53757, dtype: int64

In [200]:
new_train=df_2.body

In [201]:
X_train_cv_new = v.transform(new_train)

In [202]:
predict_neww=model.predict(X_train_cv_new)

In [203]:



print(predict_neww)

[0 0 1 ... 0 0 0]


In [204]:
print(new_test)

0        1
1        0
2        1
3        1
4        1
        ..
53752    1
53753    1
53754    0
53755    0
53756    1
Name: label, Length: 53757, dtype: int64


In [205]:
print(classification_report(new_test,predict_neww))

              precision    recall  f1-score   support

           0       0.42      0.64      0.51     24358
           1       0.48      0.28      0.35     29399

    accuracy                           0.44     53757
   macro avg       0.45      0.46      0.43     53757
weighted avg       0.46      0.44      0.42     53757



In [206]:
confusion_matrix(new_test,predict_neww)

array([[15617,  8741],
       [21280,  8119]])

In [207]:
df_2.label.value_counts()

label
1    29399
0    24358
Name: count, dtype: int64

In [208]:
from sklearn.model_selection import train_test_split

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(df_2.body, df_2.label, test_size=0.2)


In [209]:
X_train_new

17484     SatCon Technology\nCorporation \n\nStock Quot...
7620     On Thursday 19 April 2007 03:53, Paolo wrote:\...
33291    ----------------------------------------------...
2496     On 4/11/07, Brendan Connors  wrote:\n> Hi R-us...
42652    \n[]\n\nHe was wounded by the Orcs, and many o...
                               ...                        
7648     \n\n The following was sent to you by 800west\...
20662    At file:///home/jelmer/bzr.samba/python/\n\n--...
29002    URL: http://build.samba.org/\n\n--- /home/buil...
43371    \n\nSeize the opportunity! – Anatrim – The up-...
Name: body, Length: 43005, dtype: object

In [210]:
from sklearn.feature_extraction.text import CountVectorizer

v_new = CountVectorizer()

X_train_new = v_new.fit_transform(X_train_new)

In [211]:
from sklearn.naive_bayes import MultinomialNB

model_new = MultinomialNB()
model_new.fit(X_train_new, y_train_new)

In [212]:
x_test_new = v_new.transform(X_test_new)

In [213]:
from sklearn.metrics import classification_report

y_pred_new = model_new.predict(x_test_new)

print(classification_report(y_test_new, y_pred_new))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      4805
           1       0.99      0.96      0.97      5947

    accuracy                           0.97     10752
   macro avg       0.97      0.97      0.97     10752
weighted avg       0.97      0.97      0.97     10752

