In [1]:
import pandas as pd
import numpy as np
import csv
import string
import re
import xml.etree.ElementTree as ET
import nltk
import math
from pathlib import Path
from collections import Counter
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
path_train = r'C:\Users\ASUS\Documents\GitHub\Skripsi-Aspect-Term-Extraction\Dataset\train.xml'
path_test =  r'C:\Users\ASUS\Documents\GitHub\Skripsi-Aspect-Term-Extraction\Dataset\test.xml'

### Parsing XML

In [3]:
def parsing_xml(path):
    tree = ET.parse(path)
    root = tree.getroot()
    list_ulasan = []
    list_opini = []
    for review in root.findall('Review'):
        for kalimat in review.findall('./sentences/sentence'):
            list_ulasan.append(kalimat.find('text').text)
    for opini in root.findall('Reciew/sentences/sentence'):
        opini_review = []
        for opinion in opini.findall('./Opinions/Opinion'):
            opinion_dict = {
                opinion.get('category'): opinion.get('polarity')
            }
            opini_review.append(opinion_dict)
        list_opini.append(opini_review)
    return list_ulasan, list_opini

In [4]:
parsing_xml(path_train)

(['Judging from previous posts this used to be a good place, but not any longer.',
  'We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.',
  'They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.',
  'The food was lousy - too sweet or too salty and the portions tiny.',
  'After all that, they complained to me about the small tip.',
  'Avoid this place!',
  'I have eaten at Saul, many times, the food is always consistently, outrageously good.',
  'Saul is the best restaurant on Smith Street and in Brooklyn.',
  'The duck confit is always amazing and the foie gras terrine with figs was out of this world.',
  'The wine list is interesting and has many good values.',
  'For the price, you cannot eat this well in Manhattan.',
  'I was very disappointed with this restaurant.',
  'Ive asked a cart attendant for a lotus leaf wrapped rice and 

In [5]:
def kelas_aspek(opini):
    opinion = []
    for opi in opini:
        for dictio in opi:
            for key in dictio:
                opinion.append(key)
    total_aspek = Counter(opinion)
    aspek = [a for a, b in total_aspek.most_common(5)]
    return aspek

def dataframe_ulasan(ulasan, opini, semua_aspek):
    data = {'Ulasan' : ulasan}
    dframe = pd.DataFrame(data)
    for idx, opi in enumerate(opini):
        for dictio in opi:
            for key in dictio:
                if key in semua_aspek:
                    dframe.loc[idx, key]= dictio[key]
    return dframe

# dframe.to_csv('data.csv', index = False)

In [6]:
# train_text_list, train_opinion_list = parsing_xml(path_train)
# semua_aspek = get_most_common_aspect(train_opinion_list)
# semua_aspek

In [7]:
# dframe = get_data_frame(train_text_list, train_opinion_list, semua_aspek)
# dframe

In [8]:
def case_folding(ulasan):
    folded = [x.lower() for x in ulasan]
    return folded

In [9]:
# dframe['Case Folding'] = case_folding(dframe['Review'])
# dframe

### Decontracted

In [10]:
def decontracted(ulasan):
    #implementation of words using regular expressions
    result = [re.sub(r"won't", "will not", ul) for ul in ulasan]
    result = [re.sub(r"can't", "can not", res) for res in result]
    result = [re.sub(r"'t", "not", res) for res in result]
    result = [re.sub(r"'re", "are", res) for res in result]
    result = [re.sub(r"'s", "is", res) for res in result]
    result = [re.sub(r"'d", "would", res) for res in result]
    result = [re.sub(r"'ll", "will", res) for res in result]
    result = [re.sub(r"'ve", "have", res) for res in result]
    result = [re.sub(r'[^\'m]', 'am', res) for res in result]
    return result

In [11]:
# def decontracted(opinion):
#     # specific
#     phrase = re.sub(r"won\'t", "will not", phrase)
#     phrase = re.sub(r"can\'t", "can not", phrase)

#     # general
#     phrase = re.sub(r"n\'t", " not", phrase)
#     phrase = re.sub(r"\'re", " are", phrase)
#     phrase = re.sub(r"\'s", " is", phrase)
#     phrase = re.sub(r"\'d", " would", phrase)
#     phrase = re.sub(r"\'ll", " will", phrase)
#     phrase = re.sub(r"\'t", " not", phrase)
#     phrase = re.sub(r"\'ve", " have", phrase)
#     phrase = re.sub(r"\'m", " am", phrase)
#     return phrase

In [12]:
# Tokenization

# def tokenization(text: str) -> list: 
#     result = [] 
#     for token in word_tokenize(text): 
#         if token.isalpha(): 
#             result.append(token) 
#     return result

In [13]:
# Lemmatization

# # def lemmatization(text: list) -> list: 
#     lemmatizer = WordNetLemmatizer() 
#     return [lemmatizer.lemmatize(word) for word in text] 

In [14]:
# Cleaning

# def clean(text: list, token_len = 1) -> list: 
#     result = [] 
#     for token in text: 
#         if len(token) > token_len and not token.isnumeric(): 
#             result.append(token.lower()) 
#     return result 

## Opinion Extractor
### Postaging

In [15]:
def posTag(ulasan):
    tagged_reviews = []
    for each_review_text in tqdm(ulasan):
        ulasan = nltk.word_tokenize(each_review_text)
        tagged_reviews.append(nltk.pos_tag(ulasan))
    return tagged_reviews

def postagUji(ulasan):
    for word in ulasan:
        ulasan = nltk.word_tokenize(word)
        tagged_reviews = nltk.post_tag(ulasan)
    return tagged_reviews

### Opinion Rule

In [16]:
# def filterTag(tagged_review):
#     final_text_list=[]
#     for text_list in tagged_review:
#         final_text=[]
#         for word,tag in text_list:
#             if tag in ['NN','NNS','NNP','NNPS','RB','RBR','RBS','JJ','JJR','JJS','VB','VBD','VBG','VBN','VBP','VBZ']:
#                 final_text.append(word)
#         final_text_list.append(' '.join(final_text))
#     return final_text_list

In [17]:
def opinion_rule(result_postag):
    results_tree = []
    grammar = "NP: {<DT|PP|CD|RB>?<JJ|JJR|JJS>*<NN|NNS|PRP|NNP|VB|IN|PRP\$>+<VBD|VBZ|VBN|VBP|VB|IN>*<JJ|JJS|RB>*<PRP|NN|NNS>*}"
    cp = nltk.RegexpParser(grammar)
    for tag in result_postag:
        results_tree.append(cp.parse(tag))
    return results_tree

def opini_rule_uji(result_postag):
    grammar = "NP: {<DT|PP|CD|RB>?<JJ|JJR|JJS>*<NN|NNS|PRP|NNP|VB|IN|PRP\$>+<VBD|VBZ|VBN|VBP|VB|IN>*<JJ|JJS|RB>*<PRP|NN|NNS>*}"
    cp = nltk.RegexpParser(grammar)
    results_tree = (cp.parse(result_postag))
    return results_tree

In [18]:
# dframe['Contraction'] = decontracted(dframe['Case Folding'])
# dframe

In [19]:
# dframe['tokenize'] = posTag(dframe['Contraction'])
# dframe

In [20]:
# dframe['OpRule'] = opinion_rule(dframe['tokenize'])
# dframe

### Opinion Extractor

In [21]:
def opini_extractor(results_rule):
    finish = []
    for result in range (len(results_rule[result])):
        temp = []
        finish.append([])
        for res in range (len(results_rule[result])):
            temp.append([])
            if type (results_rule[result][res]) == nltk.tree.Tree:
                for restu in results_rule[result][res]:
                    temp[res].append(restu[0])
            if len(temp[res]) >= 2:
                finish[result].append(" ".join(temp[res]))
    return finish

def opini_extractor_uji(result_rule):
     finish = []
     for res in result_rule:
        temp = []
        if type(res) == nltk.tree.Tree:
            for word in res:
                temp.append(word[0])
        if len (temp) >= 2:
            finish.append(" ".join(temp))
        return finish

## TF IDF

### Mengumpulkan Fitur Token

In [22]:
def getToken(ulasan):
    list_ulasan = ulasan
    token = [i.split() for i in ulasan]
    all_token = sorted(list(set([item for sublist in token for item in sublist])))
    return list_ulasan, token, all_token

def getTokenUji(ulasan, token):
    token_test = ulasan.split()
    token_test_filter = [tok for tok in token_test if tok in token]
    result_token = token + [token_test_filter]
    return result_token

# token_test_filter = [tok for tok in token_test if tok in all_token]

### Perhitungan TF IDF

In [23]:
def termWeighting(token, alltoken):
    termfreq = [[tok.count(alltok) for tok in token] for alltok in alltoken]
    docfreq = [sum(1 for tf in tfs if tf > 0 ) for tfs in termfreq]
    inversedf = [math.log10(len(tf) / df) for tf, df in zip(termfreq, docfreq)]
    tfxidf = [ [ (1 + math.log10(tf)) * inversedf if tf > 0 else tf for tf in termfreq] for termfreq, inversedf in zip(termfreq, inversedf) ]
    return inversedf, tfxidf

def termWeightingTest(new_token, alltoken, inversedf):
    termfreq = [[tok.count(alltok) for tok in new_token] for alltok in alltoken ]
    tfxidf = [[(1 + math.log10(tf)) * inversedf if tf > 0 else tf for tf in termfreq] for termfreq, inversedf in zip(termfreq, inversedf) ]
    return tfxidf

## Perhitungan Klasifikasi

### Perhitungan Nilai Kernel

In [24]:
def getKernelLinier(data, weight):
    kernel_data = np.zeros((len(data), len(data)))
    for i in tqdm(range(len(kernel_data))):
        for j in range(len(kernel_data)):
            jumlah = 0
            for k in range(len(weight)):
                jumlah += (weight[k][j] * weight[k][i])
            kernel_data[i][j] = jumlah
    return kernel_data

def getKernelLinierUji(data, datauji, weight):
    kernel_uji = np.zeros((len(data), len([datauji])))
    w_tranpose = np.array(weight).T
    w_train = w_tranpose[:-len([datauji])]
    w_test = w_tranpose[-len([datauji])]
    for i in range(len(w_train)):
        jumlah = 0
        for j in range(len(w_test)):
                jumlah += w_test[j] * w_train[i][j]
        kernel_uji[i][0] = jumlah
    return kernel_uji

### Perhitungan Matriks Hessian

In [25]:
def getMatrikHessian(kernel, lamb, y_train):
    hessian = np.zeros(kernel.shape)
    for i in tqdm(range(hessian.shape[0])):
        for j in range(hessian.shape[1]):
            hessian[i][j] = (y_train[i] * y_train[j]) * (kernel[i][j] + pow(lamb, 2))
    return hessian

### Sequential Learning

In [26]:
def seqLearning(hessian, gamma, C, maxIter):
    alpha = np.zeros(hessian.shape[0])
    error = np.zeros(hessian.shape[0])
    deltaError = np.zeros(hessian.shape[0])
    iter = 0
    
    while iter < maxIter:
        for i in range(hessian.shape[0]):
            error[i] = 0
            for j in range(hessian.shape[1]):
                error[i] += hessian[i][j] * alpha[i]
        for i in range(hessian.shape[0]):
            deltaError[i] = min(max((gamma * (1 - error[i])), -alpha[i]), C - alpha[i])
            alpha[i] = deltaError[i] + alpha[i]
            iter += 1
    return alpha

### Perhitungan Bias

In [27]:
def getBias(alpha, y_train, kernel):
    positif = alpha.tolist().index(max([data for idx, data in enumerate(alpha) if y_train[idx] == 1]))
    print("positif : ", positif)
    negatif = alpha.tolist().index(max([data for idx, data in enumerate(alpha) if y_train[idx] == -1]))
    print("negatif : ", negatif)
    kernel_pos = sum([alpha[i] * y_train[i] * kernel[i][positif] for i in range(len(y_train))])
    print("kernelpos : ", kernel_pos)
    kernel_neg = sum([alpha[i] * y_train[i] * kernel[i][negatif] for i in range(len(y_train))])
    print("kernelneg : ", kernel_neg)
    bias = -0.5 * (kernel_pos + kernel_neg)
    return bias

## Testing

In [28]:
def testingAspectOneData(alpha, bias, kerneluji, y_train):
    for i in range(kerneluji.shape[1]):
        jumlah = 0
        for j in range(kerneluji.shape[0]):
            jumlah += alpha[j] * y_train[j] * kerneluji[j][i]
    nilai_sentimen = jumlah + bias
    kelas_aspek = 1 if nilai_sentimen > 0 else -1
    return kelas_aspek

def testingSentimentOneData(alpha, bias, kerneluji, y_train):
    for i in range(kerneluji.shape[1]):
        jumlah = 0
        for j in range(kerneluji.shape[0]):
            jumlah += alpha[j] * y_train[j] * kerneluji[j][i]
    nilai_sentimen = jumlah + bias
    kelas_sentimen = "positive" if nilai_sentimen > 0 else "negative"
    return kelas_sentimen

def getNameCategory(data):
    if(1 not in data):
        pass
    else:
        res = data.index(1)
        if(res == 0):
            return "AMBIENCE#GENERAL"
        if(res == 1):
            return "DRINK#QUALITY"
        if(res == 2):
            return "FOOD#QUALITY"
        if(res == 3):
            return "RESTAURANT#GENERAL"
        if(res == 4):
            return "SERVICE#GENERAL"

## Main Training

In [29]:
def TrainingSVM():
    kernel = getKernelLinier(list_ulasan, x_train)
    hessian = getMatrikHessian(kernel, lamb, y_train)
    alpha = seqLearning(hessian, gamma, C, maxIter)
    bias = getBias(alpha, y_train, kernel)
    return alpha, bias

## Main Testing

In [30]:
def testingDataUji(data_uji, token):
    aspect_sentiment_uji = []
    uji = [data_uji]
    uji_prepos = preprocessing(uji)
    uji_decontracted = decontracted(uji_prepos)
    uji_postag = postagUji(uji_decontracted)
    uji_tree = opinion_rule_uji(uji_postag)
    uji_extract_opini = opini_extractor_uji(uji_tree)
    for opi in uji_extract_opini:
        token_uji = getTokenUji(opi, token)
        tfidf_uji = termWeightingTest(token_uji, all_token, inverse_docfreq)
        kernel_uji = getKernelLinierUji(list_ulasan, opi, tfidf_uji)
        result = getAspectSentimentTest(kernel_uji)
        for key, val in result.items():
            if key != None:
                aspect_sentiment_uji.append(result)
    return aspect_sentiment_uji                

## Perhitungan Evaluasi Setiap Kelas

In [31]:
def getEvaluation(y_actual, y_predicted):
    tp = sum(1 for x, y in zip(y_actual, y_predicted) if (x == 1) & (y == 1))
    tn = sum(1 for x, y in zip(y_actual, y_predicted) if (x == -1) & (y == -1))
    fn = sum(1 for x, y in zip(y_actual, y_predicted) if (x == 1) & (y == -1))
    fp = sum(1 for x, y in zip(y_actual, y_predicted) if (x == -1) & (y == 1))
print(tp, tn, fn, fp)

try:
    precision = tp/(tp+fp)
except ZeroDivisionError:
    precision = 0
try:
    recall = tp/(tp+fn)
except ZeroDivisionError:
    recall = 0
try:
    accuracy = ((tp+tn) / (tp + fp + tn + fn))
except ZeroDivisionError:
    accuracy = 0
try:
    f1 = (2 * precision * recall) / (precision + recall)
except ZeroDivisionError:
    f1 = 0

NameError: name 'tp' is not defined

## Perhitungan Rata-rata Evaluasi

In [None]:
tpMicro = tpr + tpf + tpd + tps + tpa
tnMicro = tnr + tnf + tnd + tns + tna
fpMicro = fpr + fpf + fpd + fps + fpa
fnMicro = fnr + fnf + fnd + fns + fna
precisionMacro = ((precision_ambience + precision_drink + precision_food + precision_restaurant + precision_service) / 5)
recallMacro = ((recall_ambience + recall_drink + recall_food + recall_restaurant + recall_service) / 5)
akurasiMacro = ((accuracy_ambience + accuracy_drink + accuracy_food + accuracy_restaurant + accuracy_service) / 5)
f1Macro = (2 * precisionMacro * recallMacro) / (precisionMacro + recallMacro)

precisionMicro = tpMicro/(tpMicro + fpMicro)
recallMicro = tpMicro / (tpMicro + fnMicro)
akurasiMicro = ((tpMiro + tnMicro) / (tpMicro + fpMicro + tnMicro + fnMicro))
f1Micro = (2* precisionMicro * recallMicro) / (precisionMicro + recallMicro)

print(precisionMacro, recallMacro, akurasiMacro, f1Macro)
print(precisionMicro, recallMicro, akurasiMicro, f1Micro)