In [53]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [54]:
data=pd.read_csv('sentences-data.csv')
data

Unnamed: 0,no,sentence,category
0,1,butuh dana segar hubungi KSP Maju Mundur,spam
1,2,selamat!! Anda mendapat satu hektar sawah,spam
2,3,"selamat pagi, servisan motor Anda sudah selesai",nonspam
3,4,"Andi, jangan lupa berdoa",nonspam


In [55]:
data["category"].value_counts()

spam       2
nonspam    2
Name: category, dtype: int64

In [56]:
data["category"].replace({"spam":"1", "nonspam":"0"}, inplace=True)
data

Unnamed: 0,no,sentence,category
0,1,butuh dana segar hubungi KSP Maju Mundur,1
1,2,selamat!! Anda mendapat satu hektar sawah,1
2,3,"selamat pagi, servisan motor Anda sudah selesai",0
3,4,"Andi, jangan lupa berdoa",0


In [57]:
# Randomize the dataset
data_randomized = data.sample(frac=1, random_state=1)

# Calculate index for split
training_test_index = round(len(data_randomized) * 0.7)

# Split into training and test sets
training_set = data_randomized[:training_test_index].reset_index(drop=True)
test_set = data_randomized[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(3, 3)
(1, 3)


In [58]:
training_set['category'].value_counts

<bound method IndexOpsMixin.value_counts of 0    0
1    0
2    1
Name: category, dtype: object>

In [59]:
test_set['category'].value_counts

<bound method IndexOpsMixin.value_counts of 0    1
Name: category, dtype: object>

In [60]:
training_set['sentence'] = training_set['sentence'].str.replace('\W', ' ')
training_set['sentence'] = training_set['sentence'].str.lower()
training_set

Unnamed: 0,no,sentence,category
0,4,andi jangan lupa berdoa,0
1,3,selamat pagi servisan motor anda sudah selesai,0
2,1,butuh dana segar hubungi ksp maju mundur,1


In [61]:
training_set['sentence'] = training_set['sentence'].str.split()

vocabulary = []
for sentence in training_set['sentence']:
   for word in sentence:
      vocabulary.append(word)

vocabulary = list(set(vocabulary))

In [62]:
len(vocabulary)

18

In [63]:
word_counts_per_sentence = {unique_word: [0] * len(training_set['sentence']) for unique_word in vocabulary}

for index, sentence in enumerate(training_set['sentence']):
   for word in sentence:
      word_counts_per_sentence[word][index] += 1

In [65]:
word_counts = pd.DataFrame(word_counts_per_sentence)
word_counts.head()

Unnamed: 0,lupa,pagi,selesai,dana,ksp,sudah,selamat,hubungi,berdoa,motor,segar,maju,jangan,butuh,servisan,andi,anda,mundur
0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0
1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,1,0,1,0
2,0,0,0,1,1,0,0,1,0,0,1,1,0,1,0,0,0,1


In [71]:
training_set_clean = pd.concat([training_set, word_counts], axis=1)
training_set_clean.head()

Unnamed: 0,no,sentence,category,lupa,pagi,selesai,dana,ksp,sudah,selamat,...,berdoa,motor,segar,maju,jangan,butuh,servisan,andi,anda,mundur
0,4,"[andi, jangan, lupa, berdoa]",0,1,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
1,3,"[selamat, pagi, servisan, motor, anda, sudah, ...",0,0,1,1,0,0,1,1,...,0,1,0,0,0,0,1,0,1,0
2,1,"[butuh, dana, segar, hubungi, ksp, maju, mundur]",1,0,0,0,1,1,0,0,...,0,0,1,1,0,1,0,0,0,1


In [72]:
# Isolating spam and ham messages first
spam_messages = training_set_clean[training_set_clean['category'] == '1']
ham_messages = training_set_clean[training_set_clean['category'] == '0']

# P(Spam) and P(Ham)
p_spam = len(spam_messages) / len(training_set_clean)
p_ham = len(ham_messages) / len(training_set_clean)

# N_Spam
n_words_per_spam_message = spam_messages['sentence'].apply(len)
n_spam = n_words_per_spam_message.sum()

# N_Ham
n_words_per_ham_message = ham_messages['sentence'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

In [73]:
# Initiate parameters
parameters_spam = {unique_word:0 for unique_word in vocabulary}
parameters_ham = {unique_word:0 for unique_word in vocabulary}

# Calculate parameters
for word in vocabulary:
   n_word_given_spam = spam_messages[word].sum() # spam_messages already defined
   p_word_given_spam = 1/((n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary))
   parameters_spam[word] = p_word_given_spam

   n_word_given_ham = ham_messages[word].sum() # ham_messages already defined
   p_word_given_ham = 1/((n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary))
   parameters_ham[word] = p_word_given_ham

In [74]:
import re

def classify(sentence):

   sentence = re.sub('\W', ' ', sentence)
   sentence = sentence.lower().split()

   p_spam_given_sentence = p_spam
   p_ham_given_sentence = p_ham

   for word in sentence:
      if word in parameters_spam:
         p_spam_given_sentence *= parameters_spam[word]

      if word in parameters_ham: 
         p_ham_given_sentence *= parameters_ham[word]

   print('P(Spam|sentence):', p_spam_given_sentence)
   print('P(Ham|sentence):', p_ham_given_sentence)

   if p_ham_given_sentence > p_spam_given_sentence:
      print('category: spam')
   elif p_ham_given_sentence < p_spam_given_sentence:
      print('Label: nonspam')
   else:
      print('Equal proabilities, have a human classify this!')

In [75]:
classify('Pinjaman online bunga ringan hanya di KSP Widyatama')

P(Spam|sentence): 4.166666666666666
P(Ham|sentence): 19.333333333333332
category: spam


In [77]:
classify('dik, pulang. jangan lupa pesenan ibu')

P(Spam|sentence): 208.33333333333331
P(Ham|sentence): 140.16666666666666
Label: nonspam


In [78]:
def classify_test_set(sentence):

   sentence = re.sub('\W', ' ', sentence)
   sentence = sentence.lower().split()

   p_spam_given_sentence = p_spam
   p_ham_given_sentence = p_ham

   for word in sentence:
      if word in parameters_spam:
         p_spam_given_sentence *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_sentence *= parameters_ham[word]

   if p_ham_given_sentence > p_spam_given_sentence:
      return 'spam'
   elif p_spam_given_sentence > p_ham_given_sentence:
      return 'ham'
   else:
      return 'needs human classification'

In [80]:
test_set['predicted'] = test_set['sentence'].apply(classify_test_set)
test_set.head()

Unnamed: 0,no,sentence,category,predicted
0,2,selamat!! Anda mendapat satu hektar sawah,1,ham


In [81]:
correct = 0
total = test_set.shape[0]

for row in test_set.iterrows():
   row = row[1]
   if row['category'] == row['predicted']:
      correct += 1

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 0
Incorrect: 1
Accuracy: 0.0
