# Import Dependencies

In [None]:
import re
import math
import string
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
!pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from collections import Counter, defaultdict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from statistics import mean

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Import Data

In [None]:
# Load the dataset from the Excel workbook into a DataFrame.
df = pd.read_excel(io="Data_selected.xlsx")

# Preprocessing Data

In [None]:
class Tokenizer:
    """Clean and tokenize Indonesian text.

    The stop-word collections and the stemmer are prepared once per instance
    instead of being rebuilt on every ``clean``/``tokenize`` call, and stop
    words are stored as sets so membership tests are O(1).

    Args:
        stopwords_id: Iterable of stop words removed by ``clean``. Defaults
            to NLTK's ``indonesian`` stop-word list.
        stopwords_en: Iterable of stop words removed by ``tokenize``.
            Defaults to NLTK's ``english`` stop-word list.
        stemmer: Object exposing ``stem(word)`` that returns the stemmed
            string. Defaults to a stemmer built by Sastrawi's
            ``StemmerFactory``.
    """

    def __init__(self, stopwords_id=None, stopwords_en=None, stemmer=None):
        # Defaults are resolved here so that callers using ``Tokenizer()``
        # get the same resources the per-call code used to load.
        if stopwords_id is None:
            stopwords_id = stopwords.words('indonesian')
        if stopwords_en is None:
            stopwords_en = stopwords.words("english")
        if stemmer is None:
            stemmer = StemmerFactory().create_stemmer()
        self.stopwords_id = frozenset(stopwords_id)
        self.stopwords_en = frozenset(stopwords_en)
        self.stemmer = stemmer

    def clean(self, text):
        """Normalize ``text`` and return the surviving words joined by spaces.

        Args:
            text: Raw input string.

        Returns:
            A single string of cleaned, stemmed words separated by one space
            (empty string when nothing survives).
        """
        # 1. Lowercase everything.
        text = text.lower()
        # 2. Lexical analysis: hyphens become spaces, then split on ANY
        #    whitespace (not only " ") so words separated by tabs/newlines
        #    are filtered individually; strip surrounding punctuation.
        words = [
            word.strip(string.punctuation)
            for word in text.replace("-", " ").split()
        ]
        # 3. Drop empty leftovers and every word that contains a digit.
        words = [
            word for word in words
            if word and not any(c.isdigit() for c in word)
        ]
        # 4. Stop-word removal (Indonesian list).
        words = [word for word in words if word not in self.stopwords_id]
        # 5. Stemming.
        words = [self.stemmer.stem(word) for word in words]
        # 6. Remove results that are only one character long (or empty).
        words = [word for word in words if len(word) > 1]
        # 7. Join everything back together.
        return " ".join(words)

    def tokenize(self, text):
        """Return the list of tokens for ``text``.

        The text is cleaned with ``clean``, split on runs of non-word
        characters, and English stop words are removed. Empty strings
        produced by the split (e.g. when the cleaned text is empty) are
        discarded so they never become vocabulary terms.

        Args:
            text: Raw input string.

        Returns:
            List of token strings, possibly empty.
        """
        cleaned = self.clean(text)
        return [
            token for token in re.split(r"\W+", cleaned)
            if token and token not in self.stopwords_en
        ]

# Build Model

In [None]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes text classifier with add-one smoothing.

    Args:
        classes: Iterable of class labels the model can predict.
        tokenizer: Object exposing ``tokenize(text)`` that returns a list of
            token strings.

    Attributes set by ``fit``:
        n_class_items: number of training documents per class.
        n_class_terms: total number of token occurrences per class.
        class_priors: fraction of training documents per class.
        raw_tf: per-class ``Counter`` mapping token -> occurrence count.
        term: set of every token seen during training (the vocabulary).
    """

    def __init__(self, classes, tokenizer):
        self.tokenizer = tokenizer
        self.classes = classes

    def group_by_class(self, X, y):
        """Return ``{label: documents of X whose label in y equals it}``.

        ``X`` and ``y`` are converted with ``np.asarray`` so plain lists are
        accepted as well as NumPy arrays.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        return {c: X[y == c] for c in self.classes}

    def fit(self, X, y):
        """Estimate class priors and per-class term counts from ``X``/``y``.

        All previously learned state is discarded, so the same instance can
        be refitted (e.g. once per cross-validation fold).

        Args:
            X: Sequence of raw text documents.
            y: Sequence of class labels, aligned with ``X``.

        Returns:
            ``self``, to allow call chaining.
        """
        self.n_class_items = {}
        self.n_class_terms = {}
        self.class_priors = {}
        self.raw_tf = {}
        self.term = set()

        n = len(X)

        for c, data in self.group_by_class(X, y).items():
            self.n_class_items[c] = len(data)
            self.class_priors[c] = self.n_class_items[c] / n

            # Counter (unlike defaultdict) returns 0 for unseen words
            # without inserting them, so lookups never mutate the model.
            term_counts = Counter()
            for text in data:
                term_counts.update(self.tokenizer.tokenize(text))

            self.raw_tf[c] = term_counts
            self.n_class_terms[c] = sum(term_counts.values())
            self.term.update(term_counts)

        return self

    def conditional_probability(self, word, text_class):
        """Return the smoothed estimate of P(word | text_class).

        Uses add-one smoothing:
        ``(count(word, class) + 1) / (total term occurrences in class + |V|)``.
        The denominator must be the number of term occurrences in the class
        (not the number of documents) for the values to sum to 1 over the
        vocabulary.
        """
        num = self.raw_tf[text_class][word] + 1
        denom = self.n_class_terms[text_class] + len(self.term)
        return num / denom

    def predict(self, X):
        """Predict a class label for every document in ``X``.

        Scores are accumulated as sums of logarithms rather than products of
        probabilities, so long documents do not underflow to 0.0 for every
        class. Each distinct known word of a document contributes once;
        words outside the training vocabulary are ignored. Ties are resolved
        in favour of the class that comes first in ``self.classes``.

        Args:
            X: Iterable of raw text documents.

        Returns:
            List with one predicted label per document.
        """
        result = []
        for text in X:
            class_scores = {}
            for c in self.classes:
                prior = self.class_priors[c]
                # A class with no training documents has prior 0; log(0) is
                # undefined, so it gets the lowest possible score instead.
                class_scores[c] = math.log(prior) if prior > 0 else -math.inf

            for word in set(self.tokenizer.tokenize(text)):
                if word not in self.term:
                    continue
                for c in self.classes:
                    class_scores[c] += math.log(
                        self.conditional_probability(word, c)
                    )

            result.append(max(class_scores, key=class_scores.get))

        return result

# Cross Validation

In [None]:
# Pull the tweet texts and their labels out of the DataFrame as arrays.
X, y = (df[column].values for column in ('tweet', 'label'))

In [None]:
def stratifiedKFold(X, y, n_splits=5):
    """Run stratified k-fold cross-validation of the Naive Bayes classifier.

    A single ``MultinomialNaiveBayes`` instance is refitted on the training
    part of every fold and scored on the held-out part. The indices, the
    predictions and the true labels of each fold are printed.

    Args:
        X: Array of documents, indexable by an array of positions.
        y: Array of labels aligned with ``X``.
        n_splits: Number of folds passed to ``StratifiedKFold``.

    Returns:
        Tuple ``(accuracies, model)``: the list of per-fold accuracy scores
        and the classifier as left after fitting on the last fold.
    """
    splitter = StratifiedKFold(n_splits)
    NB = MultinomialNaiveBayes(classes=np.unique(y), tokenizer=Tokenizer())
    fold_accuracies = []

    for train_index, test_index in splitter.split(X, y):
        print("Train:", train_index, "Validation:", test_index)

        NB.fit(X[train_index], y[train_index])
        y_test = y[test_index]
        y_pred = NB.predict(X[test_index])

        print('Predicted: {}'.format(y_pred))
        print('Actual: {}\n'.format(y_test))
        fold_accuracies.append(accuracy_score(y_test, y_pred))

    return fold_accuracies, NB

In [None]:
# Cross-validate with the default number of folds, then report the
# per-fold accuracies and their max / min / mean as percentages.
lst_accu_stratified, NB = stratifiedKFold(X, y)
print('\nList of possible accuracy:', lst_accu_stratified)
for label, summarise in (
    ('\nMaximum Accuracy That can be obtained from this model is:', max),
    ('\nMinimum Accuracy:', min),
    ('\nOverall Accuracy:', mean),
):
    print(label, summarise(lst_accu_stratified)*100, '%')

Train: [ 3  4  5  6  7  8  9 10 11 12 13 14 18 19 20 21 22 23 24 25 26 27 28 29] Validation: [ 0  1  2 15 16 17]
Predicted: [0, 0, 0, 0, 0, 0]

Actual: [1 1 1 0 0 0]

Train: [ 0  1  2  6  7  8  9 10 11 12 13 14 15 16 17 21 22 23 24 25 26 27 28 29] Validation: [ 3  4  5 18 19 20]
Predicted: [0, 1, 0, 0, 0, 0]

Actual: [1 1 1 0 0 0]

Train: [ 0  1  2  3  4  5  9 10 11 12 13 14 15 16 17 18 19 20 24 25 26 27 28 29] Validation: [ 6  7  8 21 22 23]
Predicted: [0, 1, 0, 0, 0, 0]

Actual: [1 1 1 0 0 0]

Train: [ 0  1  2  3  4  5  6  7  8 12 13 14 15 16 17 18 19 20 21 22 23 27 28 29] Validation: [ 9 10 11 24 25 26]
Predicted: [1, 1, 1, 0, 0, 0]

Actual: [1 1 1 0 0 0]

Train: [ 0  1  2  3  4  5  6  7  8  9 10 11 15 16 17 18 19 20 21 22 23 24 25 26] Validation: [12 13 14 27 28 29]
Predicted: [1, 1, 0, 0, 0, 0]

Actual: [1 1 1 0 0 0]


List of possible accuracy: [0.5, 0.6666666666666666, 0.6666666666666666, 1.0, 0.8333333333333334]

Maximum Accuracy That can be obtained from this model is: 100.0 %

In [None]:
# One sample text used to try out the classifier.
tes = [
    "Terima Kasih Mobile JKN atas layanan nya...??????",
]

In [None]:
# Classify the sample text(s) in `tes` with the classifier `NB`.
doc_pred = NB.predict(tes)
# Bare expression as the last line of the cell: displays the prediction.
doc_pred

[1]