<a href="https://colab.research.google.com/github/arijeetchoudhury100/sms_spam_detection/blob/master/spam_sms_detection_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import csv
import re
import string
import nltk

In [0]:
#load the data
data = []
labels = []
with open('spam.csv','r',encoding='latin-1') as csvfile:
  reader = csv.reader(csvfile,delimiter=',')
  for row in reader:
    labels.append(row[0])
    data.append(row[1])

In [110]:
print('No. of examples: ',len(data))

No. of examples:  5573


In [0]:
def convert_binary(x):
  if x == 'spam':
    return 1
  else:
    return 0

In [0]:
def remove_punctuation(text): 
    translator = str.maketrans(' ', ' ', string.punctuation) 
    return text.translate(translator)

In [0]:
def remove_whitespace(text): 
    return  " ".join(text.split()) 

In [0]:
#convert labels 'ham' and 'spam' to 0 and 1 respectively
blabels = list(map(convert_binary,labels))

In [116]:
#remove column headers
data = data[1:]
blabels = blabels[1:]
print(len(data))
print(len(blabels))
print(data[0])
print(blabels[0])

5572
5572
Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
0


In [0]:
#preprocess the messages

#convert messages to lowercase
data = [msg.strip().lower() for msg in data]

#convert links to 'httpaddr'
data = [re.sub('http[s]?://\S+', 'httpaddr',msg) for msg in data]

#convert all numbers to 'number'
data = [re.sub('[\d]+','number',msg) for msg in data]

#convert $ sign to 'dollar
data = [re.sub('[$]+','dollar',msg) for msg in data]

#convert email addresses to 'emailaddr'
data = [re.sub('\S+@\S+','emailaddr',msg) for msg in data]

#remove punctuations and extra whitespaces
data = [remove_punctuation(msg) for msg in data]
data = [remove_whitespace(msg) for msg in data]

In [118]:
print(len(data))
print(len(blabels))
print(data[0])
print(blabels[0])

5572
5572
go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat
0


In [0]:
from nltk.stem.porter import PorterStemmer
import nltk.corpus as nc

In [120]:
#download stopwords
nltk.download('stopwords')
stopwords = nc.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
def tokenize_and_stem(data,stopwords):
  '''
  This function tokenizes the data, removes stopwords and performs stemming
  parameters:
  data-> list of messages
  stopwords-> list of english stopwords
  '''
  tokenized_data = []
  stemmer = PorterStemmer()
  for msg in data:
    split_data = [word for word in msg.split() if word not in stopwords]
    stemmed_data = [stemmer.stem(split_word) for split_word in split_data]
    tokenized_data.append(stemmed_data)
  return tokenized_data

In [0]:
tokenized_data = tokenize_and_stem(data,stopwords)

In [123]:
print(len(tokenized_data))
print(tokenized_data[0])
print(blabels[0])

5572
['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']
0


In [0]:
def join_words(tokenized_data):
  tokenized_text = []
  for tokenized_sentence in tokenized_data:
    text = " ".join([word for word in tokenized_sentence])
    tokenized_text.append(text)
  return tokenized_text

In [0]:
tokenized_text = join_words(tokenized_data)

In [126]:
print(len(tokenized_text))
print(tokenized_text[0])
print(blabels[0])

5572
go jurong point crazi avail bugi n great world la e buffet cine got amor wat
0


In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
#extract features from the text using TF-IDF vectors
tfv = TfidfVectorizer()
tfv.fit(tokenized_text)

In [0]:
X = tfv.transform(tokenized_text).toarray() #final data

In [130]:
print(X.shape)

(5572, 7302)


In [0]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import cohen_kappa_score

In [132]:
#split data into train and test sets
X_train,X_test,y_train,y_test = train_test_split(X,blabels,test_size=0.3)
print('No. of training examples: ',len(X_train))
print('No. of tesitng examples: ',len(X_test))
print(X_train.shape)

No. of training examples:  3900
No. of tesitng examples:  1672
(3900, 7302)


In [133]:
#train the SVM model
model1 = svm.SVC()
model1.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [0]:
#perform predictions
predictions = model1.predict(X_test)

In [135]:
p = predictions == y_test
print('Accuracy: ',len(p[p == True])/len(p))

Accuracy:  0.9766746411483254


In [136]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1438
           1       0.99      0.85      0.91       234

    accuracy                           0.98      1672
   macro avg       0.98      0.92      0.95      1672
weighted avg       0.98      0.98      0.98      1672



In [137]:
print('Confusion matrix: ')
print(confusion_matrix(y_test,predictions))
print

Confusion matrix: 
[[1435    3]
 [  36  198]]


In [140]:
print(precision_recall_fscore_support(y_test,predictions,average='macro'))
print(cohen_kappa_score(y_test,predictions))

(0.9803007396734884, 0.9220338076388146, 0.9484690793139009, None)
0.8970267760701901


In [141]:
model2 = MultinomialNB(alpha=0.1)
model2.fit(X_train,y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [142]:
y_pred = model2.predict(X_test)
print(cohen_kappa_score(y_test,y_pred))

0.9178659680252463


In [143]:
print(confusion_matrix(y_test,y_pred))

[[1422   16]
 [  17  217]]


In [144]:
print(confusion_matrix(y_test,predictions))

[[1435    3]
 [  36  198]]


In [146]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1438
           1       0.93      0.93      0.93       234

    accuracy                           0.98      1672
   macro avg       0.96      0.96      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [147]:
model3 = svm.SVC(kernel='linear')
model3.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [148]:
pp = model3.predict(X_test)
print(cohen_kappa_score(y_test,pp))

0.9141742522756827


In [149]:
print(classification_report(y_test,pp))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1438
           1       0.98      0.88      0.93       234

    accuracy                           0.98      1672
   macro avg       0.98      0.94      0.96      1672
weighted avg       0.98      0.98      0.98      1672



In [150]:
print(confusion_matrix(y_test,pp))

[[1434    4]
 [  29  205]]
