# SMS Spam Filtering

In [None]:
#import Packages
import pandas as pd
import numpy as np


In [None]:
# Mount Google Drive inside the Colab runtime, then switch the working
# directory to the project folder so the relative data path used when
# loading the dataset resolves.
# NOTE(review): the folder below is tied to one particular Drive layout;
# adjust it when running under a different account or outside Colab.
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/Personal_Projects/SMS_spam"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Personal_Projects/SMS_spam


In [None]:
#Load the tab-separated SMS file; column names are supplied explicitly,
#so the first line of the file is read as data rather than as a header
sms = pd.read_csv(
    "SMSSpam",
    sep="\t",
    header=None,
    names=["Status", "Message"],
)

In [None]:
#shape of the data as (number of rows, number of columns)
sms.shape

(5572, 2)

In [None]:
#column labels of the DataFrame (the labels themselves, not how many there are)
sms.columns

Index(['Status', 'Message'], dtype='object')

In [None]:
#preview the first 5 rows of the raw data
sms.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#count the messages labelled "spam" by summing the boolean mask
int((sms["Status"] == "spam").sum())

747

In [None]:
#count the messages labelled "ham" by summing the boolean mask
int((sms["Status"] == "ham").sum())

4825

In [None]:
#encode the text labels in place: ham -> 1, spam -> 0
for label_text, label_code in (("ham", 1), ("spam", 0)):
    sms.loc[sms["Status"] == label_text, "Status"] = label_code

In [None]:
#preview the first 5 rows again to check the numeric Status codes
sms.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
#separate the message text from the status labels
sms_x, sms_y = sms["Message"], sms["Status"]

In [None]:
#import packages for text preprocessing
import re #for regular expressions
import nltk
from nltk.stem.porter import PorterStemmer #for stemming
nltk.download('stopwords')  #fetch the NLTK stopword lists used for stopword removal

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#data preprocessing part: clean every message and collect the results in `corpus`
from nltk.corpus import stopwords  #stopword lists fetched via nltk.download('stopwords')

ps = PorterStemmer()                           #a single stemmer serves the whole corpus
stop_words = set(stopwords.words('english'))   #built once instead of once per word
corpus = []
for raw_message in sms['Message']:             #covers every row, whatever the dataset size
    message = re.sub('[^a-zA-Z]', ' ', raw_message)  #substitute non alphabets with space
    message = message.lower()                  #convert into lowercase
    message = message.split()                  #split message into words
    #drop stopwords, then reduce each remaining word to its stem
    message = [ps.stem(word) for word in message if word not in stop_words]
    message = ' '.join(message)                #join the individual words back together
    corpus.append(message)

In [None]:
#number of preprocessed messages collected in the corpus
len(corpus)

5572

In [None]:
#print the first 5 cleaned messages, one per line
print("\n".join(corpus[row] for row in range(5)))

go jurong point crazi avail bugi n great world la e buffet cine got amor wat
ok lar joke wif u oni
free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli
u dun say earli hor u c alreadi say
nah think goe usf live around though


# Feature Extraction with CountVectorizer

In [None]:
#turn the cleaned corpus into a dense bag-of-words count matrix
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(corpus)  #sparse document-term counts
X = count_matrix.toarray()               #dense copy, one row per message
y = sms['Status']

In [None]:
#(number of messages, number of vocabulary terms learned by the vectorizer)
X.shape

(5572, 6296)

In [None]:
#view the first message's feature vector as a single column
X[0].reshape(-1,1)

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [None]:
#recover the vocabulary terms present in the first message.
#inverse_transform expects a (n_samples, n_features) matrix, so the row is
#reshaped to (1, -1); a (-1, 1) column would be read as one sample per
#feature and yields a long list of (mostly empty) arrays.
cv.inverse_transform(X[0].reshape(1, -1))

[array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], dtype='<U34'),
 array([], d

In [None]:
#number of labels; should match the number of rows in X
y.shape

(5572,)

In [None]:
#hold out 30% of the samples for testing; the fixed random_state makes the split repeatable
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [None]:
#cast both label vectors to int
#NOTE(review): presumably needed because Status keeps a non-numeric dtype after the in-place relabelling - confirm
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [None]:
#Naive Bayes classifier with BernoulliNB (count features)
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB().fit(X_train, y_train)
prediction = classifier.predict(X_test)
#accuracy_score is declared as (y_true, y_pred): pass the held-out labels first
print(accuracy_score(y_test, prediction))

0.9748803827751196


In [None]:
#Naive Bayes classifier with MultinomialNB (count features)
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train, y_train)
prediction = model.predict(X_test)
#accuracy_score is declared as (y_true, y_pred): pass the held-out labels first
print(accuracy_score(y_test, prediction))

0.9796650717703349


In [None]:
#Naive Bayes classifier with GaussianNB (count features)
from sklearn.naive_bayes import GaussianNB
model = GaussianNB().fit(X_train, y_train)
pred = model.predict(X_test)
#accuracy_score is declared as (y_true, y_pred): pass the held-out labels first
print(accuracy_score(y_test, pred))

0.8642344497607656


In [None]:
#KNN classifier with K=5 (count features).
#n_neighbors is spelled out because the estimator was previously built with
#its default, which is 5 and not the K=2 the old comment claimed.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)  #n_jobs=-1: use all CPU cores
neigh.fit(X_train, y_train)
prediction = neigh.predict(X_test)
#accuracy_score is declared as (y_true, y_pred): pass the held-out labels first
print(accuracy_score(y_test, prediction))

0.9192583732057417


# Feature Extraction with TfidfVectorizer

In [None]:
#turn the cleaned corpus into a dense tf-idf matrix;
#min_df=2 drops terms that occur in fewer than 2 messages
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(min_df=2)
tfidf_matrix = cv.fit_transform(corpus)  #sparse document-term weights
X = tfidf_matrix.toarray()               #dense copy, one row per message
y = sms['Status']

In [None]:
#same 70/30 split settings as before, now applied to the tf-idf features
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
)

In [None]:
#cast both label vectors to int
#NOTE(review): presumably needed because Status keeps a non-numeric dtype after the in-place relabelling - confirm
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [None]:
#Naive Bayes classifier with BernoulliNB (tf-idf features).
#BernoulliNB binarizes its input at 0.0 by default, so only the presence or
#absence of a term matters here, not its tf-idf weight.
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB().fit(X_train, y_train)
prediction = classifier.predict(X_test)
#accuracy_score is declared as (y_true, y_pred): pass the held-out labels first
print(accuracy_score(y_test, prediction))

0.9838516746411483


In [None]:
#KNN classifier (tf-idf features), using the estimator's default number of neighbours
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier()
neigh.fit(X_train, y_train)
prediction = neigh.predict(X_test)
#accuracy_score is declared as (y_true, y_pred): pass the held-out labels first
print(accuracy_score(y_test, prediction))

0.9168660287081339


In [None]:
#Naive Bayes classifier with GaussianNB (tf-idf features)
from sklearn.naive_bayes import GaussianNB
model = GaussianNB().fit(X_train, y_train)
pred = model.predict(X_test)
#accuracy_score is declared as (y_true, y_pred): pass the held-out labels first
print(accuracy_score(y_test, pred))

0.8606459330143541
