<a href="https://colab.research.google.com/github/betheman/Natural-Language-Processing/blob/master/spam_detection_simple_naive_bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords

In [34]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
messages= pd.read_csv('./spam.csv',encoding='latin-1')


In [0]:
df=pd.DataFrame()
df[['message','label']]=messages[['v2','v1']]

In [37]:
df.head()

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [0]:
import string

In [0]:
# Removing punctuation and stopwords

In [0]:
def message_cleaning(message):
    
    chars = [char for char in message if char not in string.punctuation]
    message = ''.join(chars)
    
    return [word for word in message.split() if word.lower() not in stopwords.words('english')]





In [0]:
df['message']=df['message'].apply(message_cleaning)

In [42]:
df.head()

Unnamed: 0,message,label
0,"[Go, jurong, point, crazy, Available, bugis, n...",ham
1,"[Ok, lar, Joking, wif, u, oni]",ham
2,"[Free, entry, 2, wkly, comp, win, FA, Cup, fin...",spam
3,"[U, dun, say, early, hor, U, c, already, say]",ham
4,"[Nah, dont, think, goes, usf, lives, around, t...",ham


In [0]:
# Performing Stemming

In [0]:
from nltk.stem import PorterStemmer

In [0]:
def word_stemming(message):
  stem_msg=[]
  ps = PorterStemmer()
  return [ps.stem(word) for word in message]
      
    




In [0]:
df['message']=df['message'].apply(word_stemming)

In [47]:
df.head()

Unnamed: 0,message,label
0,"[Go, jurong, point, crazi, avail, bugi, n, gre...",ham
1,"[Ok, lar, joke, wif, u, oni]",ham
2,"[free, entri, 2, wkli, comp, win, FA, cup, fin...",spam
3,"[U, dun, say, earli, hor, U, c, alreadi, say]",ham
4,"[nah, dont, think, goe, usf, live, around, tho...",ham


In [0]:
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(df['message'],df['label'], test_size=0.25, random_state=42)

In [0]:
# Creating word count dictionary according to label

In [0]:
ham_dict=dict()
spam_dict=dict()

In [0]:
def create_bag_of_words(messages,labels):
  for index in messages.index:
    label=labels[index]
    if label=='ham':
       for word in messages[index]:
         if word in ham_dict:
            ham_dict[word]=ham_dict[word]+1
         else:
            ham_dict[word]=1
    else:
       for word in messages[index]:
         if word in spam_dict:
            spam_dict[word]=spam_dict[word]+1
         else:
            spam_dict[word]=1





In [0]:
create_bag_of_words(X_train,y_train)


In [0]:
# naive bayes algorithm
# https://www.geeksforgeeks.org/naive-bayes-classifiers/

In [0]:
train_df=pd.DataFrame()
train_df['label']=y_train

In [0]:
total_count=len(train_df)
spam_count=len(train_df[train_df['label']=='spam'])
ham_count=len(train_df[train_df['label']=='ham'])

In [106]:
print(ham_count)
print(spam_count)

3623
556


In [0]:
def predict(messages):
    predictions=[]
    # calculating spam and ham probability
    p_spam=(spam_count/total_count)
    p_ham=(ham_count/total_count)
    for message in messages:
        coditional_probability_ham=p_ham
        coditional_probability_spam=p_spam
        for word in message:
          if word in ham_dict:
             coditional_probability_ham=coditional_probability_ham*(ham_dict[word]/ham_count)
          else:
             coditional_probability_ham=coditional_probability_ham*(1/ham_count)
          if word in spam_dict:
             coditional_probability_spam=coditional_probability_spam*(spam_dict[word]/spam_count)
          else:
             coditional_probability_spam=coditional_probability_spam*(1/spam_count)
        if coditional_probability_spam>coditional_probability_ham:
          predictions.append('spam')
        else:
          predictions.append('ham')
    return predictions









In [0]:
predictions=predict(X_test)

In [0]:
predictions=pd.Series(predictions)

In [0]:
from sklearn.metrics import confusion_matrix,classification_report

In [119]:
confusion_matrix(y_test,predictions)

array([[852, 350],
       [  2, 189]])

In [120]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         ham       1.00      0.71      0.83      1202
        spam       0.35      0.99      0.52       191

    accuracy                           0.75      1393
   macro avg       0.67      0.85      0.67      1393
weighted avg       0.91      0.75      0.79      1393

