<a href="https://colab.research.google.com/github/anmolg1997/SpamDetector/blob/main/Spam_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import nltk
# Fetch every NLTK resource the notebook relies on.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases; older releases use 'punkt'. Requesting both keeps a fresh
# Restart & Run All working on either (an id unknown to the installed NLTK
# index is reported as an error message and the call returns False).
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
  nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Spam Detector


In [32]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import seaborn as sns

import random
import warnings
warnings.filterwarnings('ignore')

In [33]:
## Load the SMS dataset from the mounted Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Tab-separated file read without a header row; the two columns are named label and message.
# NOTE(review): with pandas' default quoting a stray '"' inside a message may merge
# rows -- confirm the row count against the raw file (csv.QUOTE_NONE disables quoting).
spam = pd.read_csv(
    "drive/MyDrive/Colab Notebooks/NLP/P1 Spam Detector/SMSSpamCollection.txt",
    sep="\t",
    names=["label", "message"],
)
print(spam.shape)
spam.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(5572, 2)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [34]:
## Convert the DataFrame into a list of [message, label] rows (each row is itself a list)

# Zipping the two columns avoids the per-row Series that iterrows() constructs.
data_set = [[message, label] for message, label in zip(spam["message"], spam["label"])]
data_set[:5]

[['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  'ham'],
 ['Ok lar... Joking wif u oni...', 'ham'],
 ["Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  'spam'],
 ['U dun say so early hor... U c already then say...', 'ham'],
 ["Nah I don't think he goes to usf, he lives around here though", 'ham']]

### Preprocessing

In [35]:
## Build the two NLTK normalisers used by preprocess(): a WordNet lemmatizer and a Porter stemmer
wordnet_lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [36]:
def preprocess(document, stem):
  """Normalise one raw message into a space-separated string of tokens.

  Steps: lowercase, tokenize with ``word_tokenize``, drop English stopwords,
  then reduce every remaining token to a base form.

  Args:
    document: raw message text (str).
    stem: if truthy, apply the Porter stemmer; otherwise lemmatize each
      token as a verb (``pos="v"``) with the WordNet lemmatizer.

  Returns:
    The processed tokens joined by single spaces.
  """
  # Build the stopword set once per call. Evaluating stopwords.words('english')
  # inside the comprehension re-evaluates it for every token and scans a list
  # for each membership test.
  stop_words = set(stopwords.words('english'))

  tokens = word_tokenize(document.lower())
  tokens = [token for token in tokens if token not in stop_words]
  if stem:
    tokens = [stemmer.stem(token) for token in tokens]  # stemming
  else:
    tokens = [wordnet_lemmatizer.lemmatize(token, pos="v") for token in tokens]  # lemmatization
  return " ".join(tokens)

In [37]:
# Replace each raw message with its list of lemmatized tokens, keeping only
# tokens of 3+ characters, and freeze every row as a (tokens, label) tuple.
# Unpacking the rows in a comprehension removes the need for a manual position
# counter that indexes back into the list while it is being iterated.
data_set = [
  ([token for token in preprocess(message, stem=False).split() if len(token) >= 3], label)
  for message, label in data_set
]
print(data_set[:5])


[(['jurong', 'point', 'crazy..', 'available', 'bugis', 'great', 'world', 'buffet', '...', 'cine', 'get', 'amore', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'win', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', '08452810075over18'], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'usf', 'live', 'around', 'though'], 'ham')]


### Creating Feature List

In [38]:
def get_all_words(doc):
  """Flatten the token lists of every (tokens, label) pair into one list.

  Labels are ignored; tokens keep their original order, duplicates included.
  """
  return [token for tokens, _label in doc for token in tokens]


In [39]:
def feature_list(doc):
  """Return the distinct words of ``doc`` as a keys view, in first-occurrence order.

  Only the set of distinct words is needed here, so an insertion-ordered dict
  is used instead of building a full frequency distribution and discarding
  the counts. This also avoids a local variable that shadowed the function's
  own name.

  Args:
    doc: iterable of hashable tokens.

  Returns:
    A ``dict_keys`` view over the unique tokens.
  """
  return dict.fromkeys(doc).keys()

In [40]:
# Vocabulary: every distinct token found anywhere in data_set
features = [*feature_list(get_all_words(data_set))]
print(len(features))

8395


### Creating Train & Test Set


In [41]:
## Index that separates the first 80% of the rows from the remaining 20%
sliceIndex = int(len(data_set) * 0.8)

In [42]:
## Shuffle the rows so the split is not biased by the order of the source file.
# A dedicated, seeded generator makes the shuffle (and therefore the split and
# every metric derived from it) repeatable after a kernel restart, without
# altering the global `random` state.
random.Random(42).shuffle(data_set)

In [43]:
# Rows before sliceIndex form the training split; the remainder is the test split
train_messages = data_set[:sliceIndex]
test_messages = data_set[sliceIndex:]
print(f"total train docs are {len(train_messages)} and test messages are {len(test_messages)} in numbers.")

total train docs are 4457 and test messages are 1115 in numbers.


In [44]:
# Peek at the first five rows of the training split
print(train_messages[:5])

[(['gibbs', 'unsold.mike', 'hussey'], 'ham'), (['except', 'theres', 'chick', 'huge', 'boob'], 'ham'), (['fear', 'faint', 'housework', 'quick', 'cuppa'], 'ham'), (['finish', 'work', 'yet'], 'ham'), (['arngd', 'marriage', 'walkin', 'unfortuntly', 'snake', 'bite', 'love', 'marriage', 'dance', 'frnt', 'snake', 'amp', 'sayin', 'bite', 'bite'], 'ham')]


In [45]:
## Build the boolean word-presence feature dict for one message
## (one entry for each of the 8K+ vocabulary words)
def extract_features(doc, vocabulary=None):
  """Map a tokenized message to ``{'contains<word>': bool}`` features.

  Args:
    doc: iterable of tokens belonging to one message.
    vocabulary: iterable of words to emit features for. Defaults to the
      notebook-level ``features`` list, which is what existing single-argument
      callers get. Passing it explicitly lets the function be reused without
      that global, e.g. with a vocabulary loaded from elsewhere.

  Returns:
    A dict with one key per vocabulary word; the value is True when the word
    occurs in ``doc``.
  """
  if vocabulary is None:
    vocabulary = features
  doc_words = set(doc)  # set gives constant-time membership tests in the loop below
  return {f'contains{word}': (word in doc_words) for word in vocabulary}


In [46]:
## Build the feature maps for the train and test splits with extract_features
featurize = nltk.classify.apply_features
train_set = featurize(extract_features, train_messages)
test_set = featurize(extract_features, test_messages)

In [47]:
# Sanity check: number of entries in each feature map
print('Training set size : ', len(train_set))
print('Test set size : ', len(test_set))

Training set size :  4457
Test set size :  1115


### Training

In [48]:
## Train NLTK's Naive Bayes classifier on the training feature map
spamClassifier = nltk.NaiveBayesClassifier.train(train_set)

### Evaluation

In [49]:
## Accuracy on the training set (the same data the classifier was trained on)
print(nltk.classify.accuracy(spamClassifier, train_set))

0.9914740857078752


In [50]:
## Accuracy on the test set (rows not used to train the classifier)
print(nltk.classify.accuracy(spamClassifier, test_set))


0.9802690582959641


In [51]:
## Classify an example message with the newly trained classifier
m = 'CONGRATULATIONS!! As a valued account holder you have been selected to receive a £900 prize reward! Valid 12 hours only.'
# Run the message through the same pipeline as the training data (lowercase,
# tokenize, stopword removal, lemmatization, 3+ character filter). A plain
# m.split() keeps the original case and attached punctuation
# ('CONGRATULATIONS!!', 'reward!'), so those tokens cannot match the
# lowercased, lemmatized words that the features were built from.
m_tokens = [token for token in preprocess(m, stem=False).split() if len(token) >= 3]
print('Classification result : ', spamClassifier.classify(extract_features(m_tokens)))

Classification result :  spam


In [52]:
## Print the 50 most informative features of the classifier
# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output.
spamClassifier.show_most_informative_features(50)

Most Informative Features
           containsaward = True             spam : ham    =    192.3 : 1.0
           containsnokia = True             spam : ham    =    188.0 : 1.0
        containslandline = True             spam : ham    =    114.5 : 1.0
         containsservice = True             spam : ham    =    104.6 : 1.0
          containsurgent = True             spam : ham    =     89.8 : 1.0
            containscode = True             spam : ham    =     88.6 : 1.0
         containsattempt = True             spam : ham    =     75.6 : 1.0
             containstxt = True             spam : ham    =     75.6 : 1.0
          containslatest = True             spam : ham    =     68.7 : 1.0
           containsmusic = True             spam : ham    =     67.0 : 1.0
            containsrate = True             spam : ham    =     67.0 : 1.0
             contains100 = True             spam : ham    =     67.0 : 1.0
            containsdraw = True             spam : ham    =     62.0 : 1.0

In [53]:
## Store the classifier on disk for later use
import pickle

# The context manager closes the file even if pickling raises.
# NOTE(review): only the classifier is saved; extract_features() also reads the
# `features` vocabulary, so that list is needed as well to classify new
# messages in another session -- confirm whether it should be persisted too.
with open('nb_spam_classifier.pickle', 'wb') as f:
  pickle.dump(spamClassifier, f)
print('Classifier stored at ', f.name)

Classifier stored at  nb_spam_classifier.pickle
