In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import pandas as pd
#Data cleaning and preprocessing
import re    #Regular Expression
import nltk  #for tokenization,stemming,lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
#Loading Dataset
messages = pd.read_csv('/content/drive/My Drive/krish naik/NLP/Spam Classifier Project/SMSSpamCollection', sep='\t',
                           names=["label", "message"])

In [0]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [0]:
ps = PorterStemmer()   #Making object

In [0]:
corpus = []             #Making list
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])  #Stores only characters involving a-z and A-Z
    review = review.lower()                                    #Converts to lower case
    review = review.split()                                    #Stores the list in review
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]   #Removes stopwords
    review = ' '.join(review)                                  #Stores cleaned list in review
    corpus.append(review)                                      #Appending final list in corpus

In [0]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [0]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [0]:
y=pd.get_dummies(messages['label'])  #Creates 2 columns for ham and spam...so we gotta remove ham column
y=y.iloc[:,1].values           #Removes ham column...Representing 0 for ham and 1 for spam

In [0]:
y 

array([0, 0, 1, ..., 0, 0, 0], dtype=uint8)

In [0]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [0]:
# Training model using Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB     #Naive bayes works on basis of probability function...it works well for NLP or for multiple classes
spam_detect_model = MultinomialNB().fit(X_train, y_train)
y_pred=spam_detect_model.predict(X_test)

In [0]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm=confusion_matrix(y_test,y_pred)
acc=accuracy_score(y_test,y_pred)

In [0]:
cm

array([[946,   9],
       [  7, 153]])

In [0]:
acc

0.9856502242152466