In [38]:
# Mount Google Drive into the Colab runtime at /content/drive so the dataset
# stored there can be reached from later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Switch the working directory to the dataset folder; getMessages() opens
# 'SMSSpamCollection' by relative path, so this must run first.
cd /content/drive/My Drive/NLP/smsspamcollection

/content/drive/My Drive/NLP/smsspamcollection


In [0]:
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [0]:
def getMessages(path='SMSSpamCollection'):
  """Load the tab-separated SMS file into a DataFrame.

  Args:
    path: Location of the tab-separated file. Defaults to
      'SMSSpamCollection', resolved against the current working directory.

  Returns:
    A pandas DataFrame with two columns, "label" and "message". The file is
    read without a header row; the column names are assigned here.
  """
  return pd.read_csv(path, sep='\t', names=["label", "message"])

In [0]:
def populateCorpus(frame=None):
  """Build a cleaned, stemmed text corpus from the SMS messages.

  Each message has every non-letter character replaced by a space, is
  lower-cased, split on whitespace, stripped of English stop words, stemmed
  with the Porter stemmer, and re-joined with single spaces.

  Args:
    frame: Optional DataFrame with a 'message' column. When omitted, the
      module-level `messages` DataFrame is used, as before.

  Returns:
    A list of cleaned strings, one per row of the 'message' column, in row
    order.
  """
  source = messages if frame is None else frame
  stemmer = PorterStemmer()
  # Build the stop-word set once: calling stopwords.words() per token re-creates
  # the list and scans it linearly for every word of every message.
  english_stopwords = set(stopwords.words('english'))

  corpus = []
  # Iterate the column directly rather than indexing by integer label, so a
  # non-default index (e.g. after filtering rows) does not break the lookup.
  for text in source['message']:
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    stems = [stemmer.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))
  return corpus

In [0]:
def populateBagOfWords(corpus, frame=None, max_features=2500):
  """Turn the corpus into a bag-of-words matrix and a binary label vector.

  Args:
    corpus: Iterable of cleaned message strings, aligned row-for-row with
      the labels.
    frame: Optional DataFrame with a 'label' column. When omitted, the
      module-level `messages` DataFrame is used, as before.
    max_features: Vocabulary size cap passed to CountVectorizer
      (default 2500, the value previously hard-coded).

  Returns:
    A tuple (X, y): X is the dense array of token counts produced by
    CountVectorizer; y is the second column of the one-hot encoded labels.
  """
  source = messages if frame is None else frame

  vectorizer = CountVectorizer(max_features=max_features)
  X = vectorizer.fit_transform(corpus).toarray()

  # Keep the indicator column at position 1 of the one-hot encoding.
  # NOTE(review): presumably this is the 'spam' indicator for 'ham'/'spam'
  # labels - confirm against the data.
  y = pd.get_dummies(source['label']).iloc[:, 1].values
  return X, y

In [0]:
def extractTrainAndTestData(X, y, test_size=0.20, random_state=0):
  """Split features and labels into training and test partitions.

  Args:
    X: Feature matrix.
    y: Label vector aligned with X.
    test_size: Share of the data held out for testing (default 0.20, the
      value previously hard-coded).
    random_state: Seed passed to train_test_split so the split is
      reproducible (default 0, the value previously hard-coded).

  Returns:
    A tuple (X_train, X_test, y_train, y_test).
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  return X_train, X_test, y_train, y_test

In [0]:
def implementNaiveBayesModel(X_train, y_train, X_test):
  """Fit a multinomial Naive Bayes classifier and predict on the test features.

  Args:
    X_train: Training feature matrix.
    y_train: Training labels.
    X_test: Feature matrix to predict labels for.

  Returns:
    The classifier's predictions for X_test.
  """
  classifier = MultinomialNB()
  classifier.fit(X_train, y_train)
  return classifier.predict(X_test)

In [0]:
def calculateConfusionMatrix(y_test, y_pred):
  """Return the confusion matrix of true labels versus predicted labels.

  Args:
    y_test: True labels.
    y_pred: Predicted labels.
  """
  return confusion_matrix(y_test, y_pred)

In [0]:
def calculateAccuracy(y_test, y_pred):
  """Return the accuracy score of the predictions against the true labels.

  Args:
    y_test: True labels.
    y_pred: Predicted labels.
  """
  return accuracy_score(y_test, y_pred)

In [0]:
def process():
  """Run the full pipeline: load, clean, vectorize, split, train, evaluate.

  Returns:
    A tuple (confusion_m, accuracy) computed on the held-out test split.
  """
  # populateCorpus() and populateBagOfWords() read `messages` from module
  # scope. Binding it only as a local here leaves them without data on a fresh
  # kernel (NameError), so publish it at module level before calling them.
  global messages
  messages = getMessages()
  corpus = populateCorpus()
  X, y = populateBagOfWords(corpus)
  X_train, X_test, y_train, y_test = extractTrainAndTestData(X, y)
  y_pred = implementNaiveBayesModel(X_train, y_train, X_test)
  confusion_m = calculateConfusionMatrix(y_test, y_pred)
  accuracy = calculateAccuracy(y_test, y_pred)
  return confusion_m, accuracy

In [0]:
def main():
  """Run the pipeline and print the confusion matrix, then the accuracy."""
  matrix, score = process()
  report = (
      ('Confusion Matrix is : ', matrix),
      ('Accuracy is : ', score),
  )
  # Each heading goes on its own line, followed by its value.
  for heading, value in report:
    print(heading)
    print(value)

In [0]:
# Entry-point guard: run the pipeline only when this code executes as the
# main module.
if __name__ == "__main__":
    main()

Confusion Matrix is : 
[[946   9]
 [  7 153]]
Accuracy is : 
0.9856502242152466
