<a href="https://colab.research.google.com/github/ankitdv98/SMS-spam-classifier/blob/main/SMS_Spam_Classiefier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import nltk
# Fetch NLTK's "popular" data collection into the local nltk_data directory
# (network access required on first run; the call returns True on success).
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [None]:
# Load the tab-separated SMS file. `names=` supplies the column names, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): pandas' default quoting treats `"` as a quote character; if the
# raw file contains unbalanced double quotes, adjacent lines can be merged into
# one row. Compare the row count with the dataset's documented size and
# consider quoting=csv.QUOTE_NONE if they differ.
messages= pd.read_csv('SMSSpamCollection', sep= '\t',
                      names= ['label', 'message'])

In [None]:
# Display the loaded frame (pandas truncates the middle rows when rendering).
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
import re
import nltk
# Make sure the stopword lists used by the cleaning cells are available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# **Stemming with Bag of Words**

In [None]:
#Data Cleaning
# Turn every SMS into a space-joined string of stemmed, lower-cased tokens with
# English stopwords removed. The result is stored in `corpus`, one entry per row
# of `messages`, in row order.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps= PorterStemmer()
# Build the stopword set once. Rebuilding it inside the comprehension re-read
# the word list for every single token without ever changing the result.
english_stopwords= set(stopwords.words('english'))
corpus= []

# Iterate over the column values directly instead of messages['message'][i],
# which is a label lookup and only works while the index is exactly 0..n-1.
for text in messages['message']:
  review= re.sub('[^a-zA-Z]', ' ', text)  # replace every non-letter with a space
  review= review.lower()
  review= review.split()
  review= [ps.stem(word) for word in review if word not in english_stopwords]
  review= " ".join(review)
  corpus.append(review)
  

In [None]:
# Spot-check the first five cleaned messages.
corpus[0:5]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though']

In [None]:
#Creating Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
# Cap the vocabulary at the 5000 most frequent terms across the corpus.
cv= CountVectorizer(max_features= 5000)
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed in a later cell, so test messages influence the
# feature set. Also, .toarray() materialises a dense matrix; the sparse result
# of fit_transform may be sufficient for the downstream steps - confirm.
X= cv.fit_transform(corpus).toarray()

In [None]:
# (number of messages, number of vocabulary terms)
X.shape

(5572, 5000)

In [None]:
# One-hot encode the label column: one indicator column per distinct label.
y= pd.get_dummies(messages['label'])
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [None]:
# Binary target: 1 for spam, 0 for ham.
# Derived straight from the label column instead of `y.iloc[:, 1]` so that
#  - the cell can be re-run (the old form replaced `y` with an ndarray and then
#    failed on `.iloc` the second time),
#  - the spam column is chosen by name rather than by position, and
#  - the dtype stays uint8 on every pandas version (newer releases return
#    boolean dummy columns).
y= (messages['label'] == 'spam').astype('uint8').values
y[0:5]

array([0, 0, 1, 0, 0], dtype=uint8)

In [None]:
#Train Test Split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible between runs.
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 0)

In [None]:
#training model with Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes model (default hyper-parameters) on the
# training portion only.
spam_detect_model= MultinomialNB().fit(X_train, y_train)


In [None]:
# Predict labels for the held-out messages.
y_pred= spam_detect_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes (0 = ham, 1 = spam).
cm= confusion_matrix(y_test, y_pred)

In [None]:
# Show the confusion matrix for the stemming + bag-of-words model.
print(cm)

[[946   9]
 [  8 152]]


In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9847533632286996

# **Lemmatization with Bag of Words**

In [None]:
#Data Cleaning
# Rebuild `corpus`, this time lemmatizing each token instead of stemming it.
# Every SMS becomes a space-joined string of lower-cased lemmas with English
# stopwords removed, one entry per row of `messages`, in row order.
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

wordnet= WordNetLemmatizer()
# Build the stopword set once. Rebuilding it inside the comprehension re-read
# the word list for every single token without ever changing the result.
english_stopwords= set(stopwords.words('english'))
corpus= []

# Iterate over the column values directly instead of messages['message'][i],
# which is a label lookup and only works while the index is exactly 0..n-1.
for text in messages['message']:
  review= re.sub('[^a-zA-Z]', ' ', text)  # replace every non-letter with a space
  review= review.lower()
  review= review.split()
  # lemmatize() is called without a part-of-speech tag, so NLTK falls back to
  # its default (noun) for every token.
  review= [wordnet.lemmatize(word) for word in review if word not in english_stopwords]
  review= " ".join(review)
  corpus.append(review)

In [None]:
#Creating Bag of Words
# Same pipeline as the stemming section, applied to the lemmatized `corpus`:
# vectorise -> encode labels -> split -> fit Naive Bayes -> evaluate.
from sklearn.feature_extraction.text import CountVectorizer
# Cap the vocabulary at the 5000 most frequent terms across the corpus.
cv= CountVectorizer(max_features= 5000)
# NOTE(review): the vocabulary is fitted on the whole corpus before the split
# below, so test messages influence the feature set. Splitting the raw text
# first and calling fit_transform on the training part only would avoid that,
# but it changes the reported numbers - left as is pending a re-run.
X= cv.fit_transform(corpus).toarray()

# Binary target: 1 for spam, 0 for ham. Selected by label value rather than by
# dummy-column position, with an explicit dtype so the result does not depend
# on the pandas version (newer releases return boolean dummy columns).
y= (messages['label'] == 'spam').astype('uint8').values

#Train Test Split
from sklearn.model_selection import train_test_split
# 80/20 split; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 0)

#training model with Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
spam_detect_model= MultinomialNB().fit(X_train, y_train)

y_pred= spam_detect_model.predict(X_test)

from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes (0 = ham, 1 = spam).
cm= confusion_matrix(y_test, y_pred)

In [None]:
# Show the confusion matrix for the lemmatization + bag-of-words model.
print(cm)

[[944  11]
 [  9 151]]


In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9820627802690582

# **Lemmatization with TF-IDF**

In [None]:
# TF-IDF variant of the pipeline. It reuses `corpus` as left by the most recent
# cleaning cell and swaps the count features for TF-IDF weights.
from sklearn.feature_extraction.text import TfidfVectorizer
# No max_features here, so the full vocabulary is kept.
cv= TfidfVectorizer()
# NOTE(review): vocabulary and IDF weights are fitted on the whole corpus
# before the split below, so test messages influence the features. Fitting on
# the training part only would avoid that, but it changes the reported numbers
# - left as is pending a re-run.
X= cv.fit_transform(corpus).toarray()

# Binary target: 1 for spam, 0 for ham. Selected by label value rather than by
# dummy-column position, with an explicit dtype so the result does not depend
# on the pandas version (newer releases return boolean dummy columns).
y= (messages['label'] == 'spam').astype('uint8').values

#Train Test Split
from sklearn.model_selection import train_test_split
# 80/20 split; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state= 0)

#training model with Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
spam_detect_model= MultinomialNB().fit(X_train, y_train)

y_pred= spam_detect_model.predict(X_test)

from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes (0 = ham, 1 = spam).
cm= confusion_matrix(y_test, y_pred)


In [None]:
# Show the confusion matrix for the lemmatization + TF-IDF model.
print(cm)

[[955   0]
 [ 31 129]]


In [None]:
from sklearn.metrics import accuracy_score
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.9721973094170404