In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv("spam.csv", encoding= 'latin-1') [['v1','v2']]
df.columns =['label', 'message']
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.groupby('label'). describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [4]:
import string 
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
def process(text):
    #lowercasing the text
    text= text.lower()
    #remove punctuation
    text =''.join([t for t in text if t not in string.punctuation])
    #remove stopwords
    text = [t for t in text.split() if t not in stopwords.words('english')]
    #stemming
    st = Stemmer()
    text = [st.stem(t) for t in text]
    #return token list
    return text

In [5]:
process('The IPL has started . I can\'t wait for Dhoni !!!!')

['ipl', 'start', 'cant', 'wait', 'dhoni']

In [6]:
df['message'][:20].apply(process)

0     [go, jurong, point, crazi, avail, bugi, n, gre...
1                          [ok, lar, joke, wif, u, oni]
2     [free, entri, 2, wkli, comp, win, fa, cup, fin...
3         [u, dun, say, earli, hor, u, c, alreadi, say]
4     [nah, dont, think, goe, usf, live, around, tho...
5     [freemsg, hey, darl, 3, week, word, back, id, ...
6     [even, brother, like, speak, treat, like, aid,...
7     [per, request, mell, mell, oru, minnaminungint...
8     [winner, valu, network, custom, select, receiv...
9     [mobil, 11, month, u, r, entitl, updat, latest...
10    [im, gonna, home, soon, dont, want, talk, stuf...
11    [six, chanc, win, cash, 100, 20000, pound, txt...
12    [urgent, 1, week, free, membership, å£100000, ...
13    [ive, search, right, word, thank, breather, pr...
14                                       [date, sunday]
15    [xxxmobilemovieclub, use, credit, click, wap, ...
16                                     [oh, kim, watch]
17    [eh, u, rememb, 2, spell, name, ye, v, nau

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tfidfv = TfidfVectorizer(analyzer=process) 
data = tfidfv.fit_transform(df['message'])

In [9]:
from sklearn.pipeline import Pipeline 
from sklearn.naive_bayes import MultinomialNB
spam_filter = Pipeline([
    ('vectorizer',TfidfVectorizer(analyzer=process)),  #messages to weighted TFIDF score
    ('classifier', MultinomialNB())                    #train on TFIDF vectors with Naive Bayes
])

In [10]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(df['message'],df['label'],test_size=0.20,random_state= 24)

In [11]:
spam_filter.fit(x_train , y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer=<function process at 0x0000019928BFA828>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False

In [12]:
predictions = spam_filter.predict(x_test)

In [14]:
from sklearn.metrics import classification_report
print(classification_report(predictions , y_test))

              precision    recall  f1-score   support

         ham       1.00      0.96      0.98      1013
        spam       0.70      0.99      0.82       102

    accuracy                           0.96      1115
   macro avg       0.85      0.97      0.90      1115
weighted avg       0.97      0.96      0.96      1115



In [15]:
def detect_spam(s):
    return spam_filter.predict([s])[0]

In [16]:
detect_spam("Congratulations!! You have been awarded 200 rupees in a lucky draw by Samsung Mobiles. Click the link to know more")

'spam'

In [17]:
detect_spam("Call me ASAP  ! Project meeting scheduled today afternoon")

'ham'