In [1]:
import pandas as pd
import numpy as np
#import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
        #Reading the text file
    
df=pd.read_table('SMSSpamCollection.txt',header=None)
df.columns=['Flag','Message']
df['Message']

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17      Eh u r

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
Flag       5572 non-null object
Message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [4]:
y=df['Flag']
y.value_counts()

ham     4825
spam     747
Name: Flag, dtype: int64

In [5]:
    #Labelling ham as 0 and Spam as 1
    
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y_enc=le.fit_transform(y)
y_enc

array([0, 0, 1, ..., 0, 0, 0])

In [6]:
text=df['Message']

In [7]:
    #Replacing email addresses with emailaddr, urls with httpaddr, money symbol with moneysymb, phone numbers with 
    #phonenumbr, numbers with numbr

processed = text.str.replace(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b',
                                 'emailaddr')
processed = processed.str.replace(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)',
                                  'httpaddr')
processed = processed.str.replace(r'£|\$', 'moneysymb')    
processed = processed.str.replace(
    r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
    'phonenumbr')    
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')


    #Removing puctuations and whitespace

processed = processed.str.replace(r'[^\w\d\s]', ' ')
processed = processed.str.replace(r'\s+', ' ')
processed = processed.str.replace(r'^\s+|\s+?$', '')
processed.head(5)

0    Go until jurong point crazy Available only in ...
1                              Ok lar Joking wif u oni
2    Free entry in numbr a wkly comp to win FA Cup ...
3          U dun say so early hor U c already then say
4    Nah I don t think he goes to usf he lives arou...
Name: Message, dtype: object

In [8]:
    #Converting into lower strings

processed=processed.str.lower()

In [9]:
    #Removing stopwords
    
import nltk 
from nltk.corpus import stopwords
stopwords=nltk.corpus.stopwords.words('english')

In [10]:
processed=processed.apply(lambda x: ' '.join(term for term in x.split() if term not in set(stopwords)))

In [11]:
processed.head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry numbr wkly comp win fa cup final tk...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: Message, dtype: object

In [12]:
    #Stemming (Stemming is the process of reducing the words to their root form like fishing into fish, available
    # into avail)
    
from nltk.stem import PorterStemmer
porter=nltk.PorterStemmer()
processed=processed.apply(lambda x: ' '.join(porter.stem(term) for term in x.split()))
    

In [13]:
#Defining a function for all the above

import re

def preprocess_text(messy_string):
    assert(type(messy_string) == str)
    cleaned = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', messy_string)
    cleaned = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr',
                     cleaned)
    cleaned = re.sub(r'£|\$', 'moneysymb', cleaned)
    cleaned = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b',
        'phonenumbr', cleaned)
    cleaned = re.sub(r'\d+(\.\d+)?', 'numbr', cleaned)
    cleaned = re.sub(r'[^\w\d\s]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'^\s+|\s+?$', '', cleaned.lower())
    return ' '.join(
        porter.stem(term) 
        for term in cleaned.split()
        if term not in set(stopwords)
    )


In [14]:
(processed == text.apply(preprocess_text)).all()


True

In [15]:
    #Feature Engineering

from sklearn.feature_extraction.text import  TfidfVectorizer
vectorizer=TfidfVectorizer(ngram_range=(1,2))
X_ngrams=vectorizer.fit_transform(processed)
X_ngrams.shape

(5572, 36348)

In [16]:
    #Training and evaluating the model
    
from sklearn.model_selection import train_test_split
from sklearn import svm
X_train,X_test,y_train,y_test=train_test_split(X_ngrams,y_enc,test_size=0.2,random_state=42,stratify=y_enc)

In [17]:
clf=svm.LinearSVC(loss='hinge')
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
#support vector machine or NaiveBAyes or Multinomial Naive Bayes

In [18]:
    #Confusion Metric
    
from sklearn import metrics
pd.DataFrame(metrics.confusion_matrix(y_test,y_pred),index=[['actual','actual'],['spam','ham']],columns=[['predicted','predicted'],['spam','ham']])

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,spam,ham
actual,spam,965,1
actual,ham,19,130


In [19]:
def spam_filter(message):
    if clf.predict(vectorizer.transform([preprocess_text(message)])):
        return 'spam'
    else:
        return 'not spam'

In [49]:
spam_filter('Ohh, You won 50,000')

'spam'

In [50]:
spam_filter('Hey,This is Akshay Khatter')  

'not spam'

In [51]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.98206278026905824