In [13]:
# Libraries for the NLP spam-classification pipeline: text cleaning, vectorizing, Naive Bayes, metrics
import re
import nltk
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [3]:
# Load the tab-separated SMS Spam Collection file; column labels are supplied via names=.
# The path is a raw string so the Windows backslashes are never parsed as escape
# sequences ("\M", "\C" and "\S" are invalid escapes in a normal string literal).
# NOTE(review): this is an absolute, machine-specific path - consider a configurable data directory.
df = pd.read_csv(r"D:\MACHINE LEARNING\Machine Learning\CSV FILE\SMSSpamCollection",sep="\t",names=['level','message'])
df.head(10)

Unnamed: 0,level,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [4]:
# Data cleaning and preprocessing
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once: it is identical for every word of every message,
# so rebuilding it inside the comprehension only repeats the same work.
english_stopwords = set(stopwords.words("english"))


def _clean_message(message, normalize):
    """Return one message as a space-joined string of normalized tokens.

    message: raw text of a single SMS.
    normalize: callable applied to every kept token (a stemmer or lemmatizer method).

    Every character that is not an ASCII letter is replaced by a space, the text is
    lower-cased and split on whitespace, stop words are dropped, and the remaining
    tokens are passed through `normalize`.
    """
    tokens = re.sub('[^a-zA-Z]', " ", message).lower().split()
    return " ".join(normalize(token) for token in tokens if token not in english_stopwords)


# corpus holds the stemmed messages and corpus2 the lemmatized ones, in the row order of df.
corpus = [_clean_message(message, stemmer.stem) for message in df['message']]
corpus2 = [_clean_message(message, lemmatizer.lemmatize) for message in df['message']]

In [5]:
# Creating the bag of words model
# Each corpus gets its own vectorizer: fit_transform learns a new vocabulary on every
# call, so reusing one instance would discard the vocabulary that defines X's columns.
cv_stem = CountVectorizer()
# X: token counts for the stemmed corpus (feature matrix, not labels).
X = cv_stem.fit_transform(corpus).toarray()
# cv ends up fitted on the lemmatized corpus, exactly as before.
cv = CountVectorizer()
# Y: token counts for the lemmatized corpus (also a feature matrix, despite the name).
# NOTE(review): .toarray() makes both matrices dense; confirm memory is acceptable.
Y = cv.fit_transform(corpus2).toarray()

In [6]:
# Creating the bag of words model by tf_idf model
# Each corpus gets its own vectorizer: fit_transform learns a new vocabulary on every
# call, so reusing one instance would discard the vocabulary that defines X2's columns.
tf_idf_vec_stem = TfidfVectorizer()
# X2: tf-idf features for the stemmed corpus.
X2 = tf_idf_vec_stem.fit_transform(corpus).toarray()
# tf_idf_vec ends up fitted on the lemmatized corpus, exactly as before.
tf_idf_vec = TfidfVectorizer()
# Y2: tf-idf features for the lemmatized corpus.
Y2 = tf_idf_vec.fit_transform(corpus2).toarray()

In [7]:
# Print the four feature matrices in order (X, Y, X2, Y2),
# with a row of 50 asterisks between consecutive matrices.
divider = "*"*50
for position, matrix in enumerate((X, Y, X2, Y2)):
    if position > 0:
        print(divider)
    print(matrix)

  (0, 2171)	1
  (0, 2827)	1
  (0, 4091)	1
  (0, 1169)	1
  (0, 379)	1
  (0, 738)	1
  (0, 2245)	1
  (0, 6135)	1
  (0, 2932)	1
  (0, 736)	1
  (0, 964)	1
  (0, 2208)	1
  (0, 190)	1
  (0, 5957)	1
  (1, 3760)	1
  (1, 2960)	1
  (1, 2794)	1
  (1, 6056)	1
  (1, 3785)	1
  (2, 2007)	1
  (2, 1673)	2
  (2, 6101)	1
  (2, 1058)	1
  (2, 6067)	1
  (2, 1791)	2
  :	:
  (5567, 3365)	1
  (5567, 1573)	1
  (5568, 2171)	1
  (5568, 2457)	1
  (5568, 1996)	1
  (5568, 1704)	1
  (5569, 5236)	1
  (5569, 3439)	1
  (5569, 4044)	1
  (5570, 2007)	1
  (5570, 5992)	1
  (5570, 3042)	1
  (5570, 3614)	1
  (5570, 5804)	1
  (5570, 4986)	1
  (5570, 763)	1
  (5570, 1630)	1
  (5570, 2099)	1
  (5570, 2290)	1
  (5570, 2669)	1
  (5570, 52)	1
  (5570, 577)	1
  (5571, 3538)	1
  (5571, 5657)	1
  (5571, 4541)	1
**************************************************
  (0, 2472)	1
  (0, 3198)	1
  (0, 4593)	1
  (0, 1337)	1
  (0, 431)	1
  (0, 826)	1
  (0, 2545)	1
  (0, 6935)	1
  (0, 3311)	1
  (0, 824)	1
  (0, 1087)	1
  (0, 2508)	1
  (0, 210)	1

In [8]:
# One-hot encode the label column: one indicator column per distinct label.
# dtype is pinned because the default indicator dtype differs between pandas versions;
# uint8 keeps the 0/1 integers that the rest of the notebook displays.
y = pd.get_dummies(df['level'], dtype='uint8')
y

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0
...,...,...
5567,0,1
5568,1,0
5569,1,0
5570,1,0


In [9]:
# Class balance as seen from the 'ham' indicator column (1 = ham, 0 = not ham).
y.loc[:, 'ham'].value_counts()

1    4825
0     747
Name: ham, dtype: int64

In [10]:
# Class balance as seen from the 'spam' indicator column (1 = spam, 0 = not spam).
y.loc[:, 'spam'].value_counts()

0    4825
1     747
Name: spam, dtype: int64

In [11]:
# Binary target vector: 1 where the label is 'spam', 0 otherwise.
# It is derived from df by label rather than from the previous value of y by position,
# so re-running this cell still works after y has been replaced by an array, and the
# result does not depend on the order of the dummy columns.
y = (df['level'] == 'spam').astype('uint8').values
y

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [22]:
# split train , test
# The same test fraction (20%) and seed are used for each of the four feature
# matrices; the suffix (none, 1, 2, 3) corresponds to X, Y, X2 and Y2.
split_kwargs = {"test_size": .2, "random_state": 1}
xtrain, xtest, ytrain, ytest = train_test_split(X, y, **split_kwargs)
xtrain1, xtest1, ytrain1, ytest1 = train_test_split(Y, y, **split_kwargs)
xtrain2, xtest2, ytrain2, ytest2 = train_test_split(X2, y, **split_kwargs)
xtrain3, xtest3, ytrain3, ytest3 = train_test_split(Y2, y, **split_kwargs)

In [23]:
# Training model using Naive bayes classifier
# One independent MultinomialNB is fitted per training set; fit() returns the
# fitted estimator, which is what each name is bound to.
spam_detect_model, spam_detect_model1, spam_detect_model2, spam_detect_model3 = (
    MultinomialNB().fit(features, labels)
    for features, labels in (
        (xtrain, ytrain),
        (xtrain1, ytrain1),
        (xtrain2, ytrain2),
        (xtrain3, ytrain3),
    )
)

In [17]:
# Predicted labels for the first test set (features split from X).
ypred = spam_detect_model.predict(X=xtest)
ypred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [28]:
# Predicted labels for the second test set (features split from Y).
ypred1 = spam_detect_model1.predict(X=xtest1)
ypred1

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [29]:
# Predicted labels for the third test set (features split from X2).
ypred2 = spam_detect_model2.predict(X=xtest2)
ypred2

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [30]:
# Predicted labels for the fourth test set (features split from Y2).
ypred3 = spam_detect_model3.predict(X=xtest3)
ypred3

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [19]:
# Confusion matrix for the first model: rows are true labels, columns are predictions.
confus_matrix = confusion_matrix(y_true=ytest, y_pred=ypred)
confus_matrix

array([[952,  16],
       [  4, 143]], dtype=int64)

In [31]:
# Confusion matrix for the second model: rows are true labels, columns are predictions.
confus_matrix1 = confusion_matrix(y_true=ytest1, y_pred=ypred1)
confus_matrix1

array([[949,  19],
       [  6, 141]], dtype=int64)

In [33]:
# Confusion matrix for the third model: rows are true labels, columns are predictions.
confus_matrix2 = confusion_matrix(y_true=ytest2, y_pred=ypred2)
confus_matrix2

array([[967,   1],
       [ 28, 119]], dtype=int64)

In [34]:
# Confusion matrix for the fourth model: rows are true labels, columns are predictions.
confus_matrix3 = confusion_matrix(y_true=ytest3, y_pred=ypred3)
confus_matrix3

array([[967,   1],
       [ 27, 120]], dtype=int64)

In [18]:
# see accuracy
# Fraction of correct predictions for the first model, shown as a percentage with 3 decimals.
accuracy = accuracy_score(y_true=ytest, y_pred=ypred)
accuracy_percent = round(accuracy*100, ndigits=3)
print("Accuracy is: ", accuracy_percent, "%")

Accuracy is:  98.206 %


In [38]:
# Fraction of correct predictions for the second model, shown as a percentage with 3 decimals.
accuracy1 = accuracy_score(y_true=ytest1, y_pred=ypred1)
accuracy1_percent = round(accuracy1*100, ndigits=3)
print("Accuracy1 is: ", accuracy1_percent, "%")

Accuracy1 is:  97.758 %


In [39]:
# Fraction of correct predictions for the third model, shown as a percentage with 3 decimals.
accuracy2 = accuracy_score(y_true=ytest2, y_pred=ypred2)
accuracy2_percent = round(accuracy2*100, ndigits=3)
print("Accuracy2 is: ", accuracy2_percent, "%")

Accuracy2 is:  97.399 %


In [40]:
# Fraction of correct predictions for the fourth model, shown as a percentage with 3 decimals.
accuracy3 = accuracy_score(y_true=ytest3, y_pred=ypred3)
accuracy3_percent = round(accuracy3*100, ndigits=3)
print("Accuracy3 is: ", accuracy3_percent, "%")

Accuracy3 is:  97.489 %


In [21]:
# Per-class precision / recall / F1 for the first model only.
report_text = classification_report(y_true=ytest, y_pred=ypred)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       968
           1       0.90      0.97      0.93       147

    accuracy                           0.98      1115
   macro avg       0.95      0.98      0.96      1115
weighted avg       0.98      0.98      0.98      1115

