### Simple binary text classifier using Naive Bayes ###

#### Data Reference - https://archive.ics.uci.edu/dataset/228/sms+spam+collection
#### Code Reference - https://youtu.be/fA5TSFELkC0?si=AKssaeknNcwuJpwi 


In [None]:
import nltk
# Fetch the NLTK stop-word corpus that the cleaning step reads through
# nltk.corpus.stopwords.
nltk.download('stopwords')

In [2]:
import pandas as pd

In [10]:
import os
import sys
import re

In [4]:
# NOTE(review): this puts the dataset folder on the module search path, but the
# directory listing below shows only data files ('readme', 'SMSSpamCollection')
# in it, so this line looks unnecessary. Confirm nothing is imported from that
# folder before removing it.
sys.path.insert(0,'./sms+spam+collection/')

In [6]:
os.listdir('./sms+spam+collection/')

['readme', 'SMSSpamCollection']

In [7]:
# Load the tab-separated SMS file; column names are supplied explicitly via names=.
# NOTE(review): the UCI dataset page documents 5,574 messages, while len(msgs)
# further down reports 5572. read_csv's default quote handling may be merging
# rows that contain a stray '"' character. Consider quoting=csv.QUOTE_NONE and
# re-running the notebook; confirm against the raw file.
msgs = pd.read_csv('./sms+spam+collection/SMSSpamCollection',sep="\t",names=['label','msg'])

In [8]:
msgs.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
from nltk.corpus import stopwords

In [12]:
from nltk.stem.porter import PorterStemmer

In [13]:
# Single Porter stemmer instance, reused for every token in the cleaning loop.
ps = PorterStemmer()

In [14]:
len(msgs)

5572

#### Data cleaning involves the following steps:
##### 1. Replace every non-alphabetic character with a space
##### 2. Convert everything to lower case
##### 3. Split each message into words
##### 4. Remove stopwords and reduce each remaining word to its stem using PorterStemmer

In [15]:
# Will hold one cleaned, stemmed string per message, in the same order as msgs.
corpus = []


In [16]:
# Build the stop-word lookup once. The original evaluated
# stopwords.words('english') for every single token and searched the returned
# list linearly; a set built up front gives the same membership answers.
stop_words = set(stopwords.words('english'))

# Start from an empty list inside this cell so that re-running it rebuilds the
# corpus instead of appending a second copy of every message.
corpus = []
for text in msgs["msg"]:
    # 1. replace anything that is not a letter with a space
    # 2. lower-case, 3. split on whitespace
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # 4. drop stop words, stem what is left, and join back into one string
    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [19]:
corpus[10]

'gonna home soon want talk stuff anymor tonight k cri enough today'

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer; max_features=5000 caps the vocabulary at the 5000
# most frequent terms across the corpus.
cv=CountVectorizer(max_features=5000)

In [22]:
# Keep the document-term matrix sparse. Densifying a 5572 x 5000 count matrix
# with .toarray() allocates roughly 220 MB that is almost entirely zeros, while
# train_test_split and MultinomialNB both accept SciPy sparse input and produce
# the same results from it. x.shape is unchanged.
# NOTE(review): the vocabulary is fitted on all messages before the train/test
# split, so test-set words influence the feature set.
x = cv.fit_transform(corpus)

In [23]:
x.shape

(5572, 5000)

#### Use pandas `get_dummies` to convert the label text to binary indicator columns
#### Retain only one column (the second one, `spam`) as the label: 1 = spam, 0 = ham


In [24]:
# One-hot encode the label column: one indicator column per distinct label value.
y = pd.get_dummies(msgs['label'])

In [29]:
# Derive the label vector straight from msgs rather than re-slicing y in place.
# The original `y = y.iloc[:,1].values` raised on a second run of the cell,
# because by then y was already a NumPy array with no .iloc.
# Column position 1 is the second indicator column; for the ham/spam labels in
# this dataset that is the 'spam' column.
y = pd.get_dummies(msgs['label']).iloc[:, 1].values

In [30]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [31]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier, created with scikit-learn's default settings.
mb = MultinomialNB()

In [32]:
# Fit the classifier on the training split only.
mb.fit(xtrain,ytrain)

In [33]:
# Predict a label for every message in the held-out test split.
ypred=mb.predict(xtest)

In [34]:
from sklearn.metrics import confusion_matrix

In [38]:
# True labels are passed first, so rows of the matrix are the actual classes
# and columns are the predicted classes.
confusion = confusion_matrix(ytest,ypred)

In [40]:
confusion

array([[946,   9],
       [  8, 152]])

In [41]:
# Unpack the 2x2 matrix row by row: the first row holds the counts for actual
# class 0 (true negatives, false positives), the second row those for actual
# class 1 (false negatives, true positives).
(tn, fp), (fn, tp) = confusion

In [42]:
tn

946

In [43]:
fp

9

In [44]:
fn

8

In [45]:
tp

152