In [1]:
import numpy as np
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
# Load the labelled message corpus from the CSV file next to the notebook.
data = pd.read_csv("spam.csv")
# Preview the first rows to inspect the raw column layout.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
# Reduce the raw frame to two columns: the message text and a 0/1 label.
# Every step addresses columns by *name* rather than by position, so the
# cell can be re-run on an already-cleaned frame without raising.

# rename column v1 to spam and v2 to email; rename() returns a new frame
# and silently ignores names that are no longer present
data = data.rename(columns={'v1': 'spam', 'v2': 'email'})
# replace all spam labels with 1
data.loc[data['spam'] == 'spam', 'spam'] = 1
# replace all ham labels with 0
data.loc[data['spam'] == 'ham', 'spam'] = 0
# keep only the two useful columns, text first; this is also what discards
# the empty "Unnamed" columns
data = data[['email', 'spam']]

data.head()

Unnamed: 0,email,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
from sklearn.model_selection import train_test_split
# Features are the raw message texts, targets are the 0/1 spam labels.
x = data["email"]
y = data["spam"]
# Hold out a third of the messages for testing. The seed is fixed so that
# restarting the kernel reproduces the same split, and therefore the same
# word counts and priors downstream.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)

# Module-level state shared by the training and classification functions.

# additive smoothing constant used when turning counts into probabilities
alpha = 1
# per-class word -> occurrence count tables (two distinct dict objects)
trainPositive, trainNegative = {}, {}
# every token seen during training, duplicates included
totalWords = []
# running number of tokens counted for each class
positiveTotal = negativeTotal = 0
# class priors; overwritten by train()
pA = pNotA = 0

In [5]:
def process(text):
    """Turn a raw message into a list of normalised word stems.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, split on whitespace, drop English stopwords and
    stem each remaining token with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw message body.

    Returns
    -------
    list of str
        Stemmed tokens in their original order (duplicates are kept).
    """
    # lowercase it
    text = text.lower()
    # remove punctuation
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    # Fetch the stopword list once per call and keep it in a set. Calling
    # stopwords.words() inside the filter rebuilt the whole list for every
    # token and then searched it linearly.
    # NOTE(review): punctuation is stripped before this filter, so stopword
    # entries that contain an apostrophe can no longer match - confirm that
    # this is intended.
    stop_words = set(stopwords.words('english'))
    tokens = [t for t in text.split() if t not in stop_words]
    # stemming
    st = Stemmer()
    # return token list
    return [st.stem(t) for t in tokens]

#reading words from a specific email
def processEmail(body, label):
    """Tokenise one training message and add its words to the class counts.

    Parameters
    ----------
    body : str
        Raw message text; it is normalised with ``process`` first.
    label : object
        Class of the message. A value equal to ``1`` is counted as the
        positive (spam) class, anything else as the negative class.

    Side effects
    ------------
    Updates the module-level ``trainPositive`` / ``trainNegative`` tables,
    the ``positiveTotal`` / ``negativeTotal`` counters, and appends every
    token to ``totalWords``.
    """
    global positiveTotal, negativeTotal, totalWords
    is_positive = label == 1
    # pick the count table for this message's class once, up front
    counts = trainPositive if is_positive else trainNegative
    for token in process(body):
        counts[token] = counts.get(token, 0) + 1
        if is_positive:
            positiveTotal += 1
        else:
            negativeTotal += 1
        totalWords.append(token)

def train():
    """Rebuild the word counts and class priors from the training split.

    Reads the module-level ``x_train`` / ``y_train`` pair, feeds every
    message through ``processEmail`` and then sets the priors ``pA``
    (fraction of messages whose label equals 1) and ``pNotA`` (``1 - pA``).

    All accumulated state is cleared first, so calling ``train()`` again
    (for example by re-running the cell) produces the same model instead of
    adding every message to the counts a second time.
    """
    global pA, pNotA, positiveTotal, negativeTotal

    # start from a clean slate; the containers are emptied in place so that
    # other functions holding the same objects see the reset
    positiveTotal = 0
    negativeTotal = 0
    trainPositive.clear()
    trainNegative.clear()
    totalWords.clear()

    total = 0
    numSpam = 0
    for email, spam in zip(x_train, y_train):
        if spam == 1:
            numSpam += 1
        total += 1
        processEmail(email, spam)

    # the priors depend only on the final tallies, so compute them once
    if total:
        pA = numSpam / float(total)
        pNotA = 1 - pA
    else:
        # no training messages: leave both priors at zero rather than
        # dividing by zero
        pA = 0
        pNotA = 0


train()

In [6]:
# Prompt for the message that should be classified as spam or ham; the
# answer is kept as the raw, unprocessed string.
text = input("Enter the Text: ")
     
#gives the conditional probability p(B_i/A_x)
def conditionalWord(word, spam):
    """Return the additively smoothed probability of `word` given a class.

    Parameters
    ----------
    word : str
        Token to look up in the class count table.
    spam : bool
        Truthy selects the positive (spam) counts, falsy the negative ones.

    Returns
    -------
    float
        ``(count(word) + alpha) / (class_total + alpha * numWords)``.

    Notes
    -----
    Reads the module globals ``alpha`` and ``numWords``. In the visible
    notebook ``numWords`` is only assigned inside ``classify``, so that
    function has to run before this one.
    """
    if spam:
        counts, classTotal = trainPositive, positiveTotal
    else:
        counts, classTotal = trainNegative, negativeTotal
    numerator = counts.get(word, 0) + alpha
    denominator = float(classTotal + alpha * numWords)
    return numerator / denominator

def conditionalEmail(body, spam):
    """Multiply the per-item class likelihoods of everything in `body`.

    Parameters
    ----------
    body : iterable
        Items to score; each element is passed to ``conditionalWord`` as is.
        Iterating a plain string yields single characters, so callers should
        pass a token list.
    spam : bool
        Class selector forwarded to ``conditionalWord``.

    Returns
    -------
    float
        Product of the individual probabilities; ``1.0`` for an empty body.
    """
    likelihood = 1.0
    for item in body:
        likelihood = likelihood * conditionalWord(item, spam)
    return likelihood

#classifies a new email as spam or not spam
def classify(email):
    """Decide whether a message is spam using the trained word counts.

    Parameters
    ----------
    email : str or iterable of str
        Either the raw message text, which is normalised with ``process``
        exactly like the training messages, or an already tokenised message.

    Returns
    -------
    bool
        ``True`` when the prior-weighted spam likelihood is strictly greater
        than the prior-weighted non-spam likelihood.

    Notes
    -----
    Sets the module global ``numWords`` (vocabulary size) that
    ``conditionalWord`` reads. The likelihoods are plain products of
    probabilities, so very long messages can underflow to ``0.0`` on both
    sides, in which case the result is ``False``.
    """
    global pA, pNotA, numWords, totalWords
    # Raw text has to be tokenised first: iterating over a str directly would
    # score individual characters instead of stemmed words. Other iterables
    # are materialised because the tokens are scanned once per class.
    tokens = process(email) if isinstance(email, str) else list(email)
    numWords = len(set(totalWords))
    isSpam = pA * conditionalEmail(tokens, True)        # P(A|B)
    notSpam = pNotA * conditionalEmail(tokens, False)   # P(¬A|B)
    return isSpam > notSpam
  
# Classify the entered text and report the verdict.
# NOTE: this rebinds `x`, which earlier held the message column used for
# the train/test split.
x = classify(text)

print("The text entered is Spam" if x == 1 else "The text entered is Not Spam")


Enter the Text: Sign up today
The text entered is Spam
