In [None]:
# Dependencies for the whole notebook: regex and XML parsing from the standard
# library, NLTK for tokenising / POS tagging / chunking, scikit-learn for the
# final evaluation metrics.
import pandas as pd
import re
import xml.etree.ElementTree as ET
import nltk
# Fetch the NLTK resources up front. The recorded cell output shows all four
# packages already up to date, i.e. repeated runs are harmless.
# NOTE(review): no cell visible in this notebook references 'stopwords' or
# 'wordnet' - confirm whether these two downloads are still needed.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 
from copy import deepcopy
from itertools import chain
from sklearn.metrics import accuracy_score, classification_report

# Google Colab helpers: mount Google Drive under /content/gdrive.
from google.colab import files, drive
drive.mount('/content/gdrive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
def parsing_xml(data):
  """Parse a review XML file into parallel lists of sentences and labels.

  The root element is searched for ``Review`` children; inside each review
  every ``sentences/sentence`` element is inspected for a ``text`` child and
  for its first ``Opinions/Opinion`` child.

  Args:
    data: File name or file object accepted by
      ``xml.etree.ElementTree.parse``.

  Returns:
    A tuple ``(list_ulasan, list_opini)``. Both lists hold one inner list per
    ``Review``. ``list_ulasan[r][s]`` is the sentence text and
    ``list_opini[r][s]`` is the single-entry dict ``{category: polarity}``
    read from the same sentence, so the two structures stay index-aligned.
    Sentences without text or without any opinion are left out of both.
  """
  root = ET.parse(data).getroot()
  list_ulasan, list_opini = [], []

  for review in root.findall('Review'):
    ulasan_review, opini_review = [], []
    for sentence in review.findall('./sentences/sentence'):
      text_node = sentence.find('text')
      # A sentence lacking a <text> child is skipped exactly like one whose
      # <text> is empty, instead of failing with AttributeError on None.text.
      teks = text_node.text if text_node is not None else None
      # Only the first <Opinion> of a sentence is used; others are ignored.
      opinion = sentence.find('./Opinions/Opinion')
      if teks is None or opinion is None:
        continue
      ulasan_review.append(teks)
      opini_review.append({opinion.get('category'): opinion.get('polarity')})
    list_ulasan.append(ulasan_review)
    list_opini.append(opini_review)
  return list_ulasan, list_opini

In [None]:
def preprocessing(data):
  """Normalise every sentence to lower-case letters and whitespace only.

  Args:
    data: List of reviews, each a list of sentence strings.

  Returns:
    A new nested list with the same shape as ``data``. In each sentence all
    characters other than ASCII letters and whitespace are removed, the text
    is lower-cased, and runs of spaces are collapsed to a single space.
  """
  def _normalise(sentence):
    letters_only = re.sub(r"[^a-zA-Z\s]+", "", sentence)
    return re.sub(r" +", " ", letters_only.lower())

  return [[_normalise(sentence) for sentence in review] for review in data]

In [None]:
def decontracted(ulasan):
  """Expand common English contractions in every sentence.

  The previous implementation rebound its loop variable on every
  substitution and therefore returned the input unchanged; the expanded
  sentences are now actually collected and returned.

  Args:
    ulasan: List of reviews, each a list of sentence strings.

  Returns:
    A new nested list with the same shape as ``ulasan`` holding the expanded
    sentences. The input lists are not modified.
  """
  # Order matters: the specific forms ("won't", "can't") must be rewritten
  # before the generic "n't" rule, and "n't" before the bare "'t" rule.
  rules = (
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
  )

  def _expand(sentence):
    for pattern, replacement in rules:
      sentence = re.sub(pattern, replacement, sentence)
    return sentence

  return [[_expand(sentence) for sentence in review] for review in ulasan]

In [None]:
def postag(review):
  """Tokenise and POS-tag every sentence, overwriting the input in place.

  Args:
    review: List of reviews, each a mutable list of sentence strings.

  Returns:
    The same ``review`` object; every sentence string has been replaced by
    the result of ``nltk.pos_tag(nltk.word_tokenize(sentence))``.
  """
  for sentences in review:
    for position, sentence in enumerate(sentences):
      tokens = nltk.word_tokenize(sentence)
      sentences[position] = nltk.pos_tag(tokens)
  return review

In [None]:
def opini_rule(result_postag):
  """Chunk every POS-tagged sentence with the NP grammar, in place.

  Args:
    result_postag: List of reviews, each a mutable list of tagged sentences
      (the output of ``postag``).

  Returns:
    The same ``result_postag`` object; every tagged sentence has been
    replaced by the result of ``nltk.RegexpParser(grammar).parse(...)``.
  """
  # Raw strings keep the "\$" of the PRP$ tag as a literal backslash-dollar
  # without relying on Python tolerating an invalid escape sequence, which
  # triggers a SyntaxWarning on recent interpreters. The grammar text itself
  # is unchanged.
  grammar = (
    r"NP: {<DT|PP|CD|RB>?<JJ|JJR|JJS>*"
    r"<NN|NNS|PRP|NNP|VB|IN|PRP\$>+"
    r"<VBD|VBZ|VBN|VBP|VB|IN>*<JJ|JJS|RB>*<PRP|NN|NNS>*}"
  )
  chunker = nltk.RegexpParser(grammar)
  for sentences in result_postag:
    for position, tagged in enumerate(sentences):
      sentences[position] = chunker.parse(tagged)
  return result_postag

In [None]:
def opini_extractor(result_rule):
  """Turn every parse tree back into a space-joined string of its tokens.

  Args:
    result_rule: List of reviews, each a list of chunk-parse results.

  Returns:
    One list of strings per review. For each entry that is an
    ``nltk.tree.Tree`` the first item of every leaf is collected - from
    nested subtrees as well as from top-level leaves - and joined with
    single spaces. Entries yielding fewer than two tokens (including
    entries that are not trees) produce no string.
  """
  extracted = []
  for review_trees in result_rule:
    kept = []
    for parsed in review_trees:
      words = []
      if type(parsed) == nltk.tree.Tree:
        for node in parsed:
          if type(node) == nltk.tree.Tree:
            # Chunk subtree: take the token of each of its leaves.
            words.extend(leaf[0] for leaf in node)
          else:
            # Plain (token, tag) leaf outside any chunk.
            words.append(node[0])
      if len(words) >= 2:
        kept.append(" ".join(words))
    extracted.append(kept)
  return extracted

In [None]:
def find_in_list_of_list(mylist, char):
    """Locate the first occurrence of ``char`` inside a list of lists.

    Args:
        mylist: List whose items are themselves lists.
        char: Value to look for.

    Returns:
        A tuple ``(outer_index, inner_index)`` pointing at the first sub-list
        that contains ``char`` and at its first position inside that sub-list.

    Raises:
        ValueError: If no sub-list contains ``char``.
    """
    for outer_index, sub_list in enumerate(mylist):
        if char not in sub_list:
            continue
        return outer_index, sub_list.index(char)
    raise ValueError("'{char}' is not in list".format(char=char))

In [None]:
def class_extractor(list_ulasan_old, list_ulasan_new, list_opini):
  """Keep only the labels whose sentence survived extraction.

  A sentence at ``list_ulasan_old[r][s]`` counts as surviving when the same
  string occurs in ``list_ulasan_new[r]``.

  The previous implementation looked each dropped sentence up by text and
  deleted labels by index one at a time. After the first deletion in a
  review the remaining labels had shifted, so later deletions hit the wrong
  label (or raised IndexError), and a sentence text occurring in several
  reviews was always resolved to the first of them. Labels are now selected
  by position within their own review.

  Args:
    list_ulasan_old: Per-review lists of sentences before extraction.
    list_ulasan_new: Per-review lists of sentences after extraction.
    list_opini: Per-review lists of labels, index-aligned with
      ``list_ulasan_old``.

  Returns:
    A new list with one inner list per review containing the labels of the
    surviving sentences in their original order. ``list_opini`` itself is
    left untouched, so calling the function again gives the same result.
  """
  kept_opini = []
  for review_index, old_sentences in enumerate(list_ulasan_old):
    surviving = list_ulasan_new[review_index]
    labels = list_opini[review_index]
    kept_opini.append([
      labels[position]
      for position, sentence in enumerate(old_sentences)
      if sentence in surviving
    ])
  return kept_opini

In [None]:
def extractFeature(text, results=None):
  """Collect the unique whitespace-separated words of all sentences.

  Args:
    text: List of reviews, each a list of sentence strings.
    results: Optional list of already known words. When given, new words are
      appended to this very list; when omitted, a fresh list is created for
      each call (the former ``results=[]`` default was shared between calls
      and silently accumulated words across them).

  Returns:
    The list of unique words in first-seen order, starting with whatever
    ``results`` already contained.
  """
  if results is None:
    results = []
  # Set mirror of ``results`` for constant-time membership checks while the
  # list preserves insertion order.
  seen = set(results)
  for review in text:
    for sentence in review:
      for word in sentence.split():
        if word not in seen:
          seen.add(word)
          results.append(word)

  return results

In [None]:
def termFrequency(text, feature):
  """Count how often each feature word occurs in each sentence.

  Args:
    text: List of reviews, each a list of sentence strings.
    feature: List of vocabulary words.

  Returns:
    A nested list shaped review -> sentence -> counts, where the counts list
    has one integer per entry of ``feature`` (in the same order) giving the
    number of occurrences among the sentence's whitespace-separated tokens.
  """
  tokenized = [[sentence.split() for sentence in review] for review in text]
  return [
    [[tokens.count(term) for term in feature] for tokens in review_tokens]
    for review_tokens in tokenized
  ]

In [None]:
def devideClass(class_ListTest):
  """Split a list of label dicts into parallel key and value lists.

  Args:
    class_ListTest: List of dicts; every key/value pair of every dict is
      emitted in iteration order.

  Returns:
    A tuple ``(aspectClass, sentimentClass)`` holding all keys and all
    values respectively, position-aligned with each other.
  """
  pairs = [pair for label in class_ListTest for pair in label.items()]
  aspectClass = [aspect for aspect, _ in pairs]
  sentimentClass = [sentiment for _, sentiment in pairs]
  return aspectClass, sentimentClass

In [None]:
def prior(docVal, docTarget, seekTarget=""):
  """Estimate the prior of the target class and of all other classes.

  Args:
    docVal: Sequence whose length is used as the denominator.
    docTarget: List of label dicts; every key is compared with
      ``seekTarget``.
    seekTarget: Label regarded as the target class.

  Returns:
    ``[target_share, other_share]``: the number of keys equal to
    ``seekTarget`` and the number of all remaining keys, each divided by
    ``len(docVal)``.

  Raises:
    ZeroDivisionError: If ``docVal`` is empty.
  """
  n_docs = len(docVal)
  labels = [label for target in docTarget for label, _ in target.items()]
  n_target = sum(1 for label in labels if label == seekTarget)
  n_other = len(labels) - n_target
  return [n_target / n_docs, n_other / n_docs]

In [None]:
def wordCounter(tfDocTrain, docTrainTarget, seekTarget=""):
  """Sum the term frequencies of every feature per class.

  Args:
    tfDocTrain: List of term-frequency vectors, one per document. The
      length of the first vector determines the number of features.
    docTrainTarget: List of label dicts, index-aligned with ``tfDocTrain``.
    seekTarget: Label regarded as the target class.

  Returns:
    ``[target_totals, other_totals]``; each list has one entry per feature
    with the summed frequency over the documents labelled ``seekTarget``
    and over all other documents respectively.
  """
  target_totals, other_totals = [], []
  for column in range(len(tfDocTrain[0])):
    in_target = 0
    in_other = 0
    for row_index, row in enumerate(tfDocTrain):
      for label, _ in docTrainTarget[row_index].items():
        if label != seekTarget:
          in_other += row[column]
        else:
          in_target += row[column]
    target_totals.append(in_target)
    other_totals.append(in_other)

  return [target_totals, other_totals]

In [None]:
def likelihood(wordCounterVal, list_feature):
  """Compute add-one smoothed word likelihoods for the two classes.

  Args:
    wordCounterVal: Per-class lists of summed word counts (the output of
      ``wordCounter``).
    list_feature: Vocabulary list; only its length is used.

  Returns:
    A two-element list; entry ``c`` holds, for every count in
    ``wordCounterVal[c]``, the value
    ``(count + 1) / (sum(wordCounterVal[c]) + len(list_feature))``.
  """
  perClass = [[], []]
  vocabulary_size = len(list_feature)
  for class_index, counts in enumerate(wordCounterVal):
    # The denominator is the same for every word of one class.
    denominator = sum(counts) + vocabulary_size
    for count in counts:
      perClass[class_index].append((count + 1) / denominator)
  return perClass

In [None]:
def naiveBayes(priorVal, likelihoodVal, tfTest):
  """Score every test document for one class.

  Args:
    priorVal: Prior of the class.
    likelihoodVal: Per-feature likelihoods of the class.
    tfTest: List of term-frequency vectors, one per test document.

  Returns:
    One score per document: ``priorVal`` multiplied by the likelihood of
    every feature whose frequency in the document is non-zero. Each such
    feature contributes once, regardless of how often it occurs.
  """
  scores = []
  for document in tfTest:
    product = 1
    for feature_index, frequency in enumerate(document):
      # Only features that actually occur in the document contribute.
      if frequency != 0:
        product *= likelihoodVal[feature_index]
    scores.append(priorVal * product)

  return scores

In [None]:
def voteClass(firstClassResult, secondClassResult, targetClass):
  """Pick a label for every document by comparing the two class scores.

  Args:
    firstClassResult: Scores of the target class, one per document.
    secondClassResult: Scores of the complementary class, index-aligned
      with ``firstClassResult``.
    targetClass: Name of the target class.

  Returns:
    A list with exactly one label per document: ``targetClass`` when the
    first score is strictly greater, otherwise ``"NON " + targetClass``.
    Previously a tie produced no label at all, which made the result
    shorter than the input and shifted every later prediction against its
    gold label. Ties are now assigned to the complementary class.
  """
  other_class = "NON " + targetClass
  result = []
  for position, first_score in enumerate(firstClassResult):
    if first_score > secondClassResult[position]:
      result.append(targetClass)
    else:
      result.append(other_class)
  return result

In [None]:
# Locations of the training and test XML files passed to parsing_xml below.
data_train = '...'
data_test = '...'

# Parse both files into per-review sentence lists (list_ulasan*) and the
# matching per-review {category: polarity} label lists (list_opini*).
list_ulasan, list_opini = parsing_xml(data_train)
list_ulasan_test, list_opini_test = parsing_xml(data_test)

In [None]:
# Text preprocessing: lower-case every sentence and keep only letters and
# whitespace (train and test).
list_ulasan_prepos = preprocessing(list_ulasan)
list_ulasan_prepos_test = preprocessing(list_ulasan_test)

In [None]:
# Expand contractions, then snapshot the sentence strings with deepcopy:
# postag() later overwrites the sentence lists in place, while
# class_extractor() still needs the plain strings.
# NOTE(review): preprocessing() has already removed every apostrophe, so none
# of the apostrophe-based patterns in decontracted() can match at this point
# - confirm whether decontraction was meant to run before preprocessing.
list_decontracted = decontracted(list_ulasan_prepos)
list_decontracted_test = decontracted(list_ulasan_prepos_test)
list_decontracted_copy = deepcopy(list_decontracted)
list_decontracted_test_copy = deepcopy(list_decontracted_test)

In [None]:
# POS tagging: postag() replaces each sentence string inside
# list_decontracted* with its tagged token list and returns that same object.
list_postag = postag(list_decontracted)
list_postag_test = postag(list_decontracted_test)

In [None]:
# Opinion rule: chunk the tagged sentences with the NP grammar defined in
# opini_rule(); the entries are again replaced in place, now by parse results.
list_opini_rule = opini_rule(list_postag)
list_opini_rule_uji = opini_rule(list_postag_test)



In [None]:
# Opinion extractor: rebuild a space-joined string from the tokens of every
# parse tree; entries with fewer than two tokens are dropped.
list_extractor = opini_extractor(list_opini_rule)
list_extractor_uji = opini_extractor(list_opini_rule_uji)

In [None]:
# Class extractor: remove from the label lists the entries whose sentence
# (taken from the pre-tagging snapshot) is absent from the extractor output.
list_opini_new = class_extractor(list_decontracted_copy,list_extractor,list_opini)
list_opini_test_new = class_extractor(list_decontracted_test_copy,list_extractor_uji,list_opini_test)

In [None]:
# Build the vocabulary: the unique words of the extracted training sentences.
# NOTE(review): the second call passes the same training sentences again, so
# it cannot add any new word - confirm whether a different input (for example
# list_extractor_uji) was intended or whether the call can be removed.
list_feature = extractFeature(list_extractor)
list_feature = extractFeature(list_extractor,list_feature)

In [None]:
# Term frequency: count every vocabulary word in every extracted sentence of
# the training and the test set.
list_tf = termFrequency(list_extractor,list_feature)
list_tf_uji = termFrequency(list_extractor_uji,list_feature)

In [None]:
# Flatten the review -> sentence nesting into one flat list per variable, so
# that each sentence is treated as a separate document from here on.
termFreq = list(chain.from_iterable(list_tf))

termFreq_Test = list(chain.from_iterable(list_tf_uji))

class_list = list(chain.from_iterable(list_opini_new))

class_ListTest = list(chain.from_iterable(list_opini_test_new))

In [None]:
# Split the test label dicts into two parallel lists: aspect categories
# (the keys) and sentiment polarities (the values).
aspectClassTest, sentimentClassTest = devideClass(class_ListTest)

# Binarise the labels: everything that is not mainTargetClass is renamed to
# "NON <mainTargetClass>".
mainTargetClass = "RESTAURANT#GENERAL"
aspectClassTestNew = ["NON "+mainTargetClass if x != mainTargetClass else mainTargetClass for x in aspectClassTest]
# NOTE(review): sentimentClassTest holds polarity values, yet they are compared
# against an aspect category name here, so presumably every entry becomes
# "NON <mainTargetClass>". The variable is not used by any cell visible in
# this notebook - confirm whether it can be dropped.
sentimentClassTestNew = ["NON "+mainTargetClass if x != mainTargetClass else mainTargetClass for x in sentimentClassTest]

In [None]:
# Priors from the training labels: tyu[0] belongs to mainTargetClass, tyu[1]
# to all other classes; both are divided by the number of training vectors.
tyu = prior(termFreq,class_list, mainTargetClass)

In [None]:
# Total occurrences of every vocabulary word per class: tes[0] for
# mainTargetClass, tes[1] for all other classes.
tes = wordCounter(termFreq, class_list, mainTargetClass)

In [None]:
# Add-one smoothed likelihood of every vocabulary word for both classes.
tesets = likelihood(tes,list_feature)

In [None]:
# Naive Bayes score of every test sentence for both classes.
# First class  = the main target class (index 0 of the prior / likelihood).
# Second class = all other classes     (index 1 of the prior / likelihood).
firstClassResult = naiveBayes(tyu[0],tesets[0], termFreq_Test)
secondClassResult = naiveBayes(tyu[1],tesets[1], termFreq_Test)

In [None]:
# Compare the two scores per test sentence to decide its predicted class.
final = voteClass(firstClassResult,secondClassResult, mainTargetClass)

In [None]:
# Evaluate the predictions against the binarised test aspect labels: overall
# accuracy (as a percentage) and the per-class precision / recall / F1 report.
print(f"Accuracy : {accuracy_score(aspectClassTestNew, final) * 100} %\n") 
print(f"Classification Report : \n\n{classification_report(aspectClassTestNew, final)}") 

Accuracy : 84.83412322274881 %

Classification Report : 

                        precision    recall  f1-score   support

NON RESTAURANT#GENERAL       0.87      0.96      0.91       170
    RESTAURANT#GENERAL       0.70      0.39      0.50        41

              accuracy                           0.85       211
             macro avg       0.78      0.67      0.71       211
          weighted avg       0.83      0.85      0.83       211

