# Importing the libraries

In [1]:
import csv
from collections import Counter

import pandas as pd

# Reading the dataset and creating a dataframe

In [2]:
# Load the tab-separated corpus into two columns: the label and the message text.
# NOTE(review): assumes the file is raw, unquoted text with one message per
# line (as the UCI SMS Spam Collection is distributed) — confirm. With pandas'
# default quote handling a stray '"' inside a message opens a quoted field, so
# the parser can swallow the following line(s) into a single row; QUOTE_NONE
# makes quote characters ordinary text.
df = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['labels', 'sms'],
    header=None,
    quoting=csv.QUOTE_NONE,
)
df.head()

Unnamed: 0,labels,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Importing nltk and string module
> nltk provides the English stopword list (and the tokenizer used later); the punctuation characters come from Python's built-in `string` module

In [3]:
import nltk
import string

# Fetch the NLTK data this notebook relies on: the stopword corpus and the
# Punkt sentence-tokenizer data that word_tokenize loads.
# Newer NLTK releases look the tokenizer data up under 'punkt_tab' rather than
# the pickled 'punkt' models, so request both to keep a fresh-kernel run
# working regardless of the installed version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stopwords as a list of lower-case strings, and the ASCII punctuation
# characters as a single string.
stopwords = nltk.corpus.stopwords.words('english')
punctuation = string.punctuation

print(stopwords[:5])
print(punctuation)

['i', 'me', 'my', 'myself', 'we']
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AkhilNikhil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AkhilNikhil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Pre-processing the data

In [4]:
from nltk.tokenize import word_tokenize

def pre_process(sms):
    """Turn a raw message string into a list of cleaned tokens.

    Steps, in order:
      1. drop every character that appears in the notebook-level
         ``punctuation`` string and lower-case the rest;
      2. split the cleaned text with ``word_tokenize``;
      3. discard tokens listed in the notebook-level ``stopwords``.

    Punctuation is stripped *before* tokenising, so a contraction such as
    "don't" reaches the stopword filter as "dont".

    Parameters
    ----------
    sms : str
        The message text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    kept_chars = []
    for char in sms:
        if char in punctuation:
            continue
        kept_chars.append(char.lower())
    cleaned_text = "".join(kept_chars)

    return [tok for tok in word_tokenize(cleaned_text) if tok not in stopwords]

In [14]:
# Tokenise every message. pre_process already takes a single argument, so it
# is handed to apply directly.
df['processed'] = df['sms'].apply(pre_process)
df['processed'].head()

0    [go, jurong, point, crazy, available, bugis, n...
1                       [ok, lar, joking, wif, u, oni]
2    [free, entry, 2, wkly, comp, win, fa, cup, fin...
3        [u, dun, say, early, hor, u, c, already, say]
4    [nah, dont, think, goes, usf, lives, around, t...
Name: processed, dtype: object

In [6]:
# Show the first rows so the raw text and its token list can be compared.
df.head(n=5)

Unnamed: 0,labels,sms,processed
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, goes, usf, lives, around, t..."


### As we can see, the pre-processing succeeded: the stopwords and punctuation have been removed and the text has been lower-cased

# Categorizing the tokens 

In [7]:
def categorize(data=None):
    """Flatten the processed tokens into one list per label.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a 'labels' column (rows marked 'spam' or 'ham') and a
        'processed' column holding an iterable of tokens per row. When
        omitted, the notebook-level ``df`` is used, so existing
        ``categorize()`` calls behave as before.

    Returns
    -------
    tuple of (list, list)
        ``(spam, ham)``: every token from the spam rows, then every token
        from the ham rows. Row order is preserved and duplicates are kept.
        Rows carrying any other label are ignored.
    """
    frame = df if data is None else data

    def _tokens_for(label):
        # Select rows and column in one .loc call rather than chained indexing.
        token_lists = frame.loc[frame['labels'] == label, 'processed']
        return [token for tokens in token_lists for token in tokens]

    return _tokens_for('spam'), _tokens_for('ham')

# Build the two token pools and peek at the first few entries of each
# (spam first, then ham).
spam, ham = categorize()
for token_pool in (spam, ham):
    print(token_pool[:5])

['free', 'entry', '2', 'wkly', 'comp']
['go', 'jurong', 'point', 'crazy', 'available']


# Predicting the message as spam or not

In [11]:
def predict(sms, spam_tokens=None, ham_tokens=None):
    """Print whether a tokenised message looks more like spam or ham.

    For every token in ``sms`` (repeats included) the number of occurrences
    in the spam pool and in the ham pool is summed; the larger total wins.
    The percentage that is printed is the winning total's share of all
    matches, i.e. ``winner / (spam_total + ham_total)``. It is labelled
    "accuracy" in the output but is not measured against held-out data.

    Parameters
    ----------
    sms : iterable of hashable
        Tokens of the message to classify (e.g. the output of pre_process).
    spam_tokens, ham_tokens : iterable of hashable, optional
        Token pools to score against. When omitted, the notebook-level
        ``spam`` and ``ham`` lists are used, so ``predict(tokens)`` behaves
        as before.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    spam_tokens = spam if spam_tokens is None else spam_tokens
    ham_tokens = ham if ham_tokens is None else ham_tokens

    # Tally each pool once instead of rescanning the whole list with
    # list.count for every token of the message.
    spam_freq = Counter(spam_tokens)
    ham_freq = Counter(ham_tokens)
    scount = sum(spam_freq[word] for word in sms)
    hcount = sum(ham_freq[word] for word in sms)

    if hcount > scount:
        accuracy = round((hcount / (hcount + scount)) * 100, 2)
        print('Message has been predicted as not spam with accuracy: {}%'.format(accuracy))
    elif scount > hcount:
        accuracy = round((scount / (hcount + scount)) * 100, 2)
        print('Message has been predicted as spam with accuracy: {}%'.format(accuracy))
    else:
        # Tie, including the case where no token of the message is known.
        print('Message could be a spam message with accuracy: 50%')


# Testing

In [18]:
# Ask for a message, run it through the same cleaning as the training data,
# then print the verdict under a banner.
text = input('Please enter a message:\n')
print('\n**********Results**********')
result = pre_process(text)
predict(result)

Please enter a message:
Thanks for your subscription to Ringtone UK your mobile will be charged £5/month Please confirm by replying YES or NO. If you reply NO you will not be charged

**********Results**********
Message has been predicted as spam with accuracy: 57.66%
