### Library

In [274]:
import pandas as pd
import numpy as np
import csv
import string
import re
import xml.etree.ElementTree as ET
import nltk
import math
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.base import BaseEstimator, ClassifierMixin, clone

In [275]:
# Import Data
data_train = r'C:\Users\ASUS\Documents\GitHub\Skripsi-Aspect-Term-Extraction\Dataset\train.xml'
data_test =  r'C:\Users\ASUS\Documents\GitHub\Skripsi-Aspect-Term-Extraction\Dataset\test.xml'

### Parsing XML

In [276]:
def parsing_xml(data):
    tree=ET.parse(data)
    root=tree.getroot()
    print(root)
    text_list=[]
    opinion_list=[]
    for review in root.findall("Review"):
        for text_string in review.findall("./sentences/sentence"):
            text_list.append(text_string.find('text').text)
        for opinion in review.findall("./sentences/sentence"):
            opinion_review=[]
            for opinion in opinion.findall('./Opinions/Opinion'):
                opini_dict={
                    opinion.get('category'):opinion.get('polarity')
                }
                opinion_review.append(opini_dict)
            opinion_list.append(opinion_review)
    return text_list, opinion_list

In [277]:
data_review, train_opinion = parsing_xml(data_train)
data_review

<Element 'Reviews' at 0x000002404DC1CB80>


['Judging from previous posts this used to be a good place, but not any longer.',
 'We, there were four of us, arrived at noon - the place was empty - and the staff acted like we were imposing on them and they were very rude.',
 'They never brought us complimentary noodles, ignored repeated requests for sugar, and threw our dishes on the table.',
 'The food was lousy - too sweet or too salty and the portions tiny.',
 'After all that, they complained to me about the small tip.',
 'Avoid this place!',
 'I have eaten at Saul, many times, the food is always consistently, outrageously good.',
 'Saul is the best restaurant on Smith Street and in Brooklyn.',
 'The duck confit is always amazing and the foie gras terrine with figs was out of this world.',
 'The wine list is interesting and has many good values.',
 'For the price, you cannot eat this well in Manhattan.',
 'I was very disappointed with this restaurant.',
 'Ive asked a cart attendant for a lotus leaf wrapped rice and she replied b

In [278]:
train_opinion

[[{'RESTAURANT#GENERAL': 'negative'}],
 [{'SERVICE#GENERAL': 'negative'}],
 [{'SERVICE#GENERAL': 'negative'}],
 [{'FOOD#QUALITY': 'negative'}, {'FOOD#STYLE_OPTIONS': 'negative'}],
 [{'SERVICE#GENERAL': 'negative'}],
 [{'RESTAURANT#GENERAL': 'negative'}],
 [{'FOOD#QUALITY': 'positive'}],
 [{'RESTAURANT#GENERAL': 'positive'}],
 [{'FOOD#QUALITY': 'positive'}, {'FOOD#QUALITY': 'positive'}],
 [{'DRINKS#STYLE_OPTIONS': 'positive'}, {'DRINKS#PRICES': 'positive'}],
 [{'RESTAURANT#PRICES': 'positive'}, {'FOOD#QUALITY': 'positive'}],
 [{'RESTAURANT#GENERAL': 'negative'}],
 [{'SERVICE#GENERAL': 'negative'}],
 [{'SERVICE#GENERAL': 'negative'}],
 [{'FOOD#QUALITY': 'neutral'}],
 [{'FOOD#QUALITY': 'negative'},
  {'FOOD#QUALITY': 'negative'},
  {'RESTAURANT#MISCELLANEOUS': 'negative'}],
 [{'RESTAURANT#GENERAL': 'negative'}],
 [{'RESTAURANT#GENERAL': 'positive'}],
 [{'SERVICE#GENERAL': 'positive'},
  {'FOOD#QUALITY': 'positive'},
  {'RESTAURANT#PRICES': 'positive'}],
 [{'RESTAURANT#GENERAL': 'positive'

In [279]:
def get_most_common_aspect(opinion_list):
    opinion = []
    for inner_list in opinion_list:
        for dictio in inner_list:
            for key in dictio:
                opinion.append(key)
    most_common_aspect = [a for a, b in nltk.FreqDist(opinion).most_common(5)]
    return most_common_aspect

In [280]:
def get_data_frame(text_list, opinion_list, most_common_aspect):
    data = {'Review' : text_list}
    df = pd.DataFrame(data)
    for idx, opinion in enumerate(opinion_list):
        for dictionary in opinion:
            for key in dictionary:
                if key in most_common_aspect:
                    df.loc[idx,key] = dictionary[key]
    return df

In [281]:
category = get_most_common_aspect(train_opinion)
category

['FOOD#QUALITY',
 'SERVICE#GENERAL',
 'RESTAURANT#GENERAL',
 'AMBIENCE#GENERAL',
 'FOOD#STYLE_OPTIONS']

In [282]:
df_train = get_data_frame(data_review, train_opinion, category)
df_train.head(10)

Unnamed: 0,Review,RESTAURANT#GENERAL,SERVICE#GENERAL,FOOD#QUALITY,FOOD#STYLE_OPTIONS,AMBIENCE#GENERAL
0,Judging from previous posts this used to be a ...,negative,,,,
1,"We, there were four of us, arrived at noon - t...",,negative,,,
2,"They never brought us complimentary noodles, i...",,negative,,,
3,The food was lousy - too sweet or too salty an...,,,negative,negative,
4,"After all that, they complained to me about th...",,negative,,,
5,Avoid this place!,negative,,,,
6,"I have eaten at Saul, many times, the food is ...",,,positive,,
7,Saul is the best restaurant on Smith Street an...,positive,,,,
8,The duck confit is always amazing and the foie...,,,positive,,
9,The wine list is interesting and has many good...,,,,,


## Preprocessing

### Case Folding

In [283]:
def case_folding(opinion):
    folded = [x.lower() for x in opinion]
    return folded

### Remove Punctuation

In [284]:
# Menghapus tanda baca menggunakan string.punctuation
def remove_punctuation(text):
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

### Stop Word Removal

In [285]:
def stop_words(opinion):
    filteredsentence=[]
    stop_words = set(stopwords.words('english'))
    wordtokenize = word_tokenize(opinion)
    for word in wordtokenize:
        if word not in stop_words:
            filteredsentence.append(word)
    final_list=' '.join(filteredsentence)
    return final_list

### Lemmatization

In [286]:
def lemmatize_text(text):
    wnl = WordNetLemmatizer()
    tokens = word_tokenize(text)  # Tokenisasi kata
    pos_tags = nltk.pos_tag(tokens)  # Menentukan pos tag untuk setiap kata

    # Menggunakan pos tag untuk memberikan informasi yang tepat kepada lemmatiser
    lemmatized_words = []
    for token, pos in pos_tags:
        pos_tag = get_wordnet_pos(pos)
        lemmatized_word = wnl.lemmatize(token, pos=pos_tag)
        lemmatized_words.append(lemmatized_word)

    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# def lemmatize_text(text):
#     lemmatizer = WordNetLemmatizer()
#     return [lemmatizer.lemmatize(word) for word in text]

In [287]:
df_train['Review'] = case_folding(df_train['Review'])
df_train.head(10)

Unnamed: 0,Review,RESTAURANT#GENERAL,SERVICE#GENERAL,FOOD#QUALITY,FOOD#STYLE_OPTIONS,AMBIENCE#GENERAL
0,judging from previous posts this used to be a ...,negative,,,,
1,"we, there were four of us, arrived at noon - t...",,negative,,,
2,"they never brought us complimentary noodles, i...",,negative,,,
3,the food was lousy - too sweet or too salty an...,,,negative,negative,
4,"after all that, they complained to me about th...",,negative,,,
5,avoid this place!,negative,,,,
6,"i have eaten at saul, many times, the food is ...",,,positive,,
7,saul is the best restaurant on smith street an...,positive,,,,
8,the duck confit is always amazing and the foie...,,,positive,,
9,the wine list is interesting and has many good...,,,,,


In [288]:
df_train['Review'] = df_train['Review'].apply(remove_punctuation)
df_train.head(10)

Unnamed: 0,Review,RESTAURANT#GENERAL,SERVICE#GENERAL,FOOD#QUALITY,FOOD#STYLE_OPTIONS,AMBIENCE#GENERAL
0,judging from previous posts this used to be a ...,negative,,,,
1,we there were four of us arrived at noon the ...,,negative,,,
2,they never brought us complimentary noodles ig...,,negative,,,
3,the food was lousy too sweet or too salty and...,,,negative,negative,
4,after all that they complained to me about the...,,negative,,,
5,avoid this place,negative,,,,
6,i have eaten at saul many times the food is al...,,,positive,,
7,saul is the best restaurant on smith street an...,positive,,,,
8,the duck confit is always amazing and the foie...,,,positive,,
9,the wine list is interesting and has many good...,,,,,


In [289]:
df_train['Review'] = df_train['Review'].apply(lambda x: stop_words(x))
df_train

Unnamed: 0,Review,RESTAURANT#GENERAL,SERVICE#GENERAL,FOOD#QUALITY,FOOD#STYLE_OPTIONS,AMBIENCE#GENERAL
0,judging previous posts used good place longer,negative,,,,
1,four us arrived noon place empty staff acted l...,,negative,,,
2,never brought us complimentary noodles ignored...,,negative,,,
3,food lousy sweet salty portions tiny,,,negative,negative,
4,complained small tip,,negative,,,
...,...,...,...,...,...,...
1995,cant believe please put bag delivering food,,negative,,,
1996,waitress came check us every minutes began cle...,,negative,,,
1997,couldnt ignore fact reach plate one friends mi...,,negative,,,
1998,put check without asking done came check bill ...,,negative,,,


In [290]:
df_train['Review'] = df_train['Review'].apply(lemmatize_text)
df_train

Unnamed: 0,Review,RESTAURANT#GENERAL,SERVICE#GENERAL,FOOD#QUALITY,FOOD#STYLE_OPTIONS,AMBIENCE#GENERAL
0,judge previous post use good place longer,negative,,,,
1,four u arrive noon place empty staff act like ...,,negative,,,
2,never bring u complimentary noodle ignore repe...,,negative,,,
3,food lousy sweet salty portion tiny,,,negative,negative,
4,complain small tip,,negative,,,
...,...,...,...,...,...,...
1995,cant believe please put bag deliver food,,negative,,,
1996,waitress come check u every minute begin clear...,,negative,,,
1997,couldnt ignore fact reach plate one friends mi...,,negative,,,
1998,put check without ask do come check bill every...,,negative,,,


In [291]:
# Mengubah nilai aspek polaritas menjadi penanda label
def get_aspect_data_frame(df, most_common_aspect):
    for common_aspect in most_common_aspect:
        df[common_aspect] = df[common_aspect].replace(['positive','negative','neutral','conflict'],[1,1,1,1])
    df = df.fillna(0)
    return df

df_aspect = get_aspect_data_frame(df_train, category)
df_aspect

Unnamed: 0,Review,RESTAURANT#GENERAL,SERVICE#GENERAL,FOOD#QUALITY,FOOD#STYLE_OPTIONS,AMBIENCE#GENERAL
0,judge previous post use good place longer,1.0,0.0,0.0,0.0,0.0
1,four u arrive noon place empty staff act like ...,0.0,1.0,0.0,0.0,0.0
2,never bring u complimentary noodle ignore repe...,0.0,1.0,0.0,0.0,0.0
3,food lousy sweet salty portion tiny,0.0,0.0,1.0,1.0,0.0
4,complain small tip,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1995,cant believe please put bag deliver food,0.0,1.0,0.0,0.0,0.0
1996,waitress come check u every minute begin clear...,0.0,1.0,0.0,0.0,0.0
1997,couldnt ignore fact reach plate one friends mi...,0.0,1.0,0.0,0.0,0.0
1998,put check without ask do come check bill every...,0.0,1.0,0.0,0.0,0.0


In [292]:
df_aspect['Label']= df_aspect['RESTAURANT#GENERAL']
df_aspect

Unnamed: 0,Review,RESTAURANT#GENERAL,SERVICE#GENERAL,FOOD#QUALITY,FOOD#STYLE_OPTIONS,AMBIENCE#GENERAL,Label
0,judge previous post use good place longer,1.0,0.0,0.0,0.0,0.0,1.0
1,four u arrive noon place empty staff act like ...,0.0,1.0,0.0,0.0,0.0,0.0
2,never bring u complimentary noodle ignore repe...,0.0,1.0,0.0,0.0,0.0,0.0
3,food lousy sweet salty portion tiny,0.0,0.0,1.0,1.0,0.0,0.0
4,complain small tip,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
1995,cant believe please put bag deliver food,0.0,1.0,0.0,0.0,0.0,0.0
1996,waitress come check u every minute begin clear...,0.0,1.0,0.0,0.0,0.0,0.0
1997,couldnt ignore fact reach plate one friends mi...,0.0,1.0,0.0,0.0,0.0,0.0
1998,put check without ask do come check bill every...,0.0,1.0,0.0,0.0,0.0,0.0


## Perhitungan TF IDF Manual

### Tokenisasi

In [293]:
def getToken(review):
    data_review = review
    token = [i.split() for i in review]
    all_token = sorted(list(set([item for sublist in token for item in sublist])))
    return data_review, token, all_token

In [294]:
data_review, token, alltokens = getToken(df_aspect['Review'])
data_review

0               judge previous post use good place longer
1       four u arrive noon place empty staff act like ...
2       never bring u complimentary noodle ignore repe...
3                     food lousy sweet salty portion tiny
4                                      complain small tip
                              ...                        
1995             cant believe please put bag deliver food
1996    waitress come check u every minute begin clear...
1997    couldnt ignore fact reach plate one friends mi...
1998    put check without ask do come check bill every...
1999    wish could like place wish someone would retra...
Name: Review, Length: 2000, dtype: object

In [295]:
token

[['judge', 'previous', 'post', 'use', 'good', 'place', 'longer'],
 ['four',
  'u',
  'arrive',
  'noon',
  'place',
  'empty',
  'staff',
  'act',
  'like',
  'impose',
  'rude'],
 ['never',
  'bring',
  'u',
  'complimentary',
  'noodle',
  'ignore',
  'repeated',
  'request',
  'sugar',
  'throw',
  'dish',
  'table'],
 ['food', 'lousy', 'sweet', 'salty', 'portion', 'tiny'],
 ['complain', 'small', 'tip'],
 ['avoid', 'place'],
 ['eaten',
  'saul',
  'many',
  'time',
  'food',
  'always',
  'consistently',
  'outrageously',
  'good'],
 ['saul', 'best', 'restaurant', 'smith', 'street', 'brooklyn'],
 ['duck',
  'confit',
  'always',
  'amaze',
  'foie',
  'gras',
  'terrine',
  'figs',
  'world'],
 ['wine', 'list', 'interest', 'many', 'good', 'value'],
 ['price', 'eat', 'well', 'manhattan'],
 ['disappointed', 'restaurant'],
 ['ive',
  'ask',
  'cart',
  'attendant',
  'lotus',
  'leaf',
  'wrap',
  'rice',
  'reply',
  'back',
  'rice',
  'walk',
  'away'],
 ['ask', 'three', 'time', 'fi

In [296]:
len(token)

2000

In [297]:
alltokens

['0',
 '1',
 '10',
 '100',
 '1015',
 '10piece',
 '11',
 '12',
 '120',
 '126',
 '13',
 '14',
 '15',
 '17',
 '170',
 '18',
 '1st',
 '2',
 '20',
 '2002',
 '20s30',
 '23rd',
 '24',
 '25',
 '29',
 '29per',
 '2nd',
 '2times',
 '3',
 '30',
 '300',
 '32nd',
 '33',
 '34',
 '35',
 '36',
 '4',
 '40',
 '40000',
 '45',
 '45mins',
 '48th',
 '4th',
 '5',
 '50',
 '500',
 '50th',
 '55',
 '56',
 '57th',
 '58',
 '5th',
 '6',
 '60',
 '620',
 '65',
 '6th',
 '7',
 '70',
 '74th',
 '8',
 '800just',
 '830',
 '850',
 '895',
 '8pm',
 '910',
 '930',
 '99',
 '9pm',
 'aback',
 'able',
 'aboveaverage',
 'abrupt',
 'absolutely',
 'absurdly',
 'abuse',
 'accept',
 'acceptable',
 'accident',
 'accidentally',
 'accidently',
 'accolades',
 'accomodating',
 'accompaniment',
 'accompany',
 'acidity',
 'acknowledgement',
 'acknowledgment',
 'acquire',
 'across',
 'act',
 'action',
 'actor',
 'actual',
 'actually',
 'ad',
 'add',
 'adderlys',
 'addition',
 'additional',
 'adequate',
 'adequately',
 'adjust',
 'admit',
 'admi

In [298]:
len(alltokens)

2945

### Perhitungan TF

In [299]:
docs = df_aspect['Review']

terms = alltokens
terms_2_dict = {term:i for i, term in enumerate(terms)}

df = []
for doc in docs: 
    df_kecil = np.zeros(len(terms))
    
    for word in doc.split():
        get_word_idx = terms_2_dict.get(word)
        if get_word_idx is not None:
            df_kecil[get_word_idx] += 1
    df.append(df_kecil)

df_tf = pd.DataFrame(np.array(df).T,columns=[f'd{i+1}' for i in range (len(df))], index=terms)
df_tf

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d1991,d1992,d1993,d1994,d1995,d1996,d1997,d1998,d1999,d2000
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zucchero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
–,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
‘,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [300]:
df_tf['d1'].loc['judge']

1.0

### Perhitungan WTF

In [301]:
def weight_tf(value):
    if value > 0:
        return 1 + math.log(value, 10)
    else:
        return 0

In [302]:
dfwtf = df_tf.applymap(weight_tf)
dfwtf

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d1991,d1992,d1993,d1994,d1995,d1996,d1997,d1998,d1999,d2000
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zucchero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
–,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
‘,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Perhitungan IDF

In [303]:
def calculate_idf(docs):
    idf_dict = {}
    total_documents = len(docs)
    for document in docs:
        words = set(document.lower().split())
        for word in words:
            idf_dict[word] = idf_dict.get(word, 0) + 1
    for word, count in idf_dict.items():
        idf_dict[word] = math.log(total_documents / (count + 1))
    return idf_dict

idf= calculate_idf(docs)
idf

{'previous': 5.809142990314028,
 'post': 6.907755278982137,
 'use': 5.2030071867437115,
 'good': 2.396895772465287,
 'judge': 6.214608098422191,
 'place': 2.2256240518579173,
 'longer': 6.214608098422191,
 'noon': 6.907755278982137,
 'empty': 5.654992310486769,
 'like': 3.158251203051766,
 'arrive': 5.298317366548036,
 'four': 5.115995809754082,
 'staff': 3.6888794541139363,
 'rude': 4.767689115485866,
 'u': 3.7722610630529876,
 'impose': 6.907755278982137,
 'act': 6.502290170873972,
 'dish': 3.816712825623821,
 'throw': 6.502290170873972,
 'never': 3.5578511917075324,
 'request': 5.521460917862246,
 'repeated': 6.907755278982137,
 'ignore': 5.809142990314028,
 'table': 3.8873303928377747,
 'bring': 5.654992310486769,
 'complimentary': 5.809142990314028,
 'noodle': 5.654992310486769,
 'sugar': 6.907755278982137,
 'tiny': 5.521460917862246,
 'sweet': 5.654992310486769,
 'salty': 6.214608098422191,
 'portion': 4.8283137373023015,
 'lousy': 6.214608098422191,
 'food': 1.9554555618988447,


In [304]:
data_dict = idf
dfidf = pd.DataFrame(data_dict.items(), columns=['Word', 'Value'])
dfidf = dfidf.sort_values('Value', ascending= True)
dfidf

Unnamed: 0,Word,Value
33,food,1.955456
5,place,2.225624
3,good,2.396896
82,great,2.459239
97,go,2.465104
...,...,...
1577,roxys,6.907755
1576,pastry,6.907755
1572,colleague,6.907755
1570,utsav,6.907755


### Perhitungan TF IDF

In [305]:
def getTF_IDF(weight_tf, idf):
    df_tf_idf = weight_tf.multiply(idf, axis='index')
    return df_tf_idf

In [306]:
df_tf_idf = getTF_IDF(dfwtf, idf)
df_tf_idf.transpose()
df_tf_idf

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d1991,d1992,d1993,d1994,d1995,d1996,d1997,d1998,d1999,d2000
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zucchero,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
–,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
‘,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Perhitungan TF IDF

In [307]:
# Menghitung Term Frequency (TF)
def calculate_tf(text):
    tf_dict = {}
    words = text.lower().split()
    total_words = len(words)
    for word in words:
        tf_dict[word] = tf_dict.get(word, 0) + 1 / total_words
    return tf_dict

# Menghitung Inverse Document Frequency (IDF)
def calculate_idf(documents):
    idf_dict = {}
    total_documents = len(documents)
    for document in documents:
        words = set(document.lower().split())
        for word in words:
            idf_dict[word] = idf_dict.get(word, 0) + 1

    for word, count in idf_dict.items():
        idf_dict[word] = math.log(total_documents / (count + 1))

    return idf_dict

# Menghitung TF-IDF
def calculate_tfidf(tf, idf):
    tfidf_dict = {}
    for word, tf_value in tf.items():
        tfidf_dict[word] = tf_value * idf.get(word, 0)
    return tfidf_dict

# Menghitung TF-IDF untuk setiap dokumen dalam dataframe
tfidf_list = []
documents = docs

for document in documents:
    tf = calculate_tf(document)
    idf = calculate_idf(documents)
    tfidf = calculate_tfidf(tf, idf)
    tfidf_list.append(tfidf)

# Membuat dataframe dari hasil TF-IDF
df_tfidf = pd.DataFrame(tfidf_list)
df_tfidf.fillna(0, inplace=True)  # Mengisi nilai NaN dengan 0

# Menampilkan dataframe hasil TF-IDF
df_tfidf

Unnamed: 0,judge,previous,post,use,good,place,longer,four,u,arrive,...,tag,teamed,hold,bag,clear,pet,peeve,everywhere,occupy,retrain
0,0.887801,0.829878,0.986822,0.743287,0.342414,0.317946,0.887801,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.202329,0.000000,0.465091,0.342933,0.481665,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.314355,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.928899,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1996,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.342933,0.000000,...,0.0,0.0,0.0,0.000000,0.295559,0.313989,0.313989,0.313989,0.000000,0.000000
1997,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.591117,0.000000,0.000000,0.000000,0.000000,0.000000
1998,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.406339,0.000000


### Download Hasil Dataframe TF IDF

In [308]:
# nama_file = 'dataframe_tfidf.xlsx'
# dftfidf.to_csv(nama_file, index=False)

In [309]:
df_aspect = df_aspect.drop('Review', axis = 1)
df_aspect

Unnamed: 0,RESTAURANT#GENERAL,SERVICE#GENERAL,FOOD#QUALITY,FOOD#STYLE_OPTIONS,AMBIENCE#GENERAL,Label
0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
1995,0.0,1.0,0.0,0.0,0.0,0.0
1996,0.0,1.0,0.0,0.0,0.0,0.0
1997,0.0,1.0,0.0,0.0,0.0,0.0
1998,0.0,1.0,0.0,0.0,0.0,0.0


## Modelling Classifier Support Vector Machine

In [310]:
from sklearn.model_selection import train_test_split

# Misalkan Anda memiliki data fitur 'X' dan target 'y'
X = df_tfidf  # Data fitur
y = df_aspect # Data target

# Memisahkan data menjadi set pelatihan dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Menampilkan ukuran set pelatihan dan pengujian
print("Jumlah data set pelatihan:", len(X_train))
print("Jumlah data set pengujian:", len(X_test))

Jumlah data set pelatihan: 1600
Jumlah data set pengujian: 400


In [311]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.base import BaseEstimator, ClassifierMixin, clone

class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, base_classifier=LogisticRegression()):
        self.base_classifier=base_classifier

    def fit(self, X, y):
        """Build a Binary Relevance classifier from the training set (X, y).
        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The training input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csc_matrix``.
        y : array-like, shape = [n_samples, n_labels] 
            The target values (class labels) as integers or strings.
        """

        # list of individual classifiers
        self.models = []
       
        # for every class label
        for label in list(y.columns):
            # Check that X and y have correct shape
            x_checked, y_checked = check_X_y(X, y[label])
            # every classifier is independent of the others
            # hence we create a copy of the base classifier instance
            base_model = clone(self.base_classifier)
            # fit the base model - one model each for Y1, Y2....Y14
            basel_model = base_model.fit(x_checked, y_checked)
            # add the fitted model list of individual classifiers
            self.models.append(base_model)

    # The predict function to make a set of predictions for a set of query instances
    def predict(self, X):
        # check if the models list has been set up
        check_is_fitted(self, ['models'])
        X = check_array(X)
        
        all_preds = pd.DataFrame()
        i=0
        # list of individual classifier predictions
        preds = []
        
        # predict against each fitted model - one model per label
        for model in self.models:
            pred = model.predict(X)
            # add the prediction to the dataframe
            preds.append(pd.DataFrame({'Class'+ str(i+1): pred}))
            i+=1
        
        # dataframe with predictions for all class labels
        all_preds = pd.concat(preds, axis=1)
        # standard sklearn classifiers return predictions as numpy arrays
        # hence convert the dataframe to a numpy array
        return all_preds.to_numpy()
    
    def predict_proba(self,X):
        # check if the models list has been set up
        check_is_fitted(self, ['models'])
        X = check_array(X)
        
        all_preds = pd.DataFrame()
        i = 0
        
        for model in self.models:
            # Call predict_proba of the each base model
            pred = model.predict_proba(X)
            # Add the probabilities of 1 to the dataframe
            all_preds['Class'+str(i+1)] = [one_prob[1] for one_prob in pred]
            i+=1
        
        #return probabilities
        return all_preds.to_numpy()

In [312]:
# instantiate the classifier
br_clf = BinaryRelevanceClassifier(LogisticRegression())
# fit
br_clf.fit(X_train, y_train)
# predict
y_pred = br_clf.predict(X_test)
print("y_pred.shape: " + str(y_pred.shape))

y_pred.shape: (400, 6)


In [313]:
def accuracy_score(y_test, y_pred):
    # y_pred is a numpy array, y_test is a dataframe
    # to compare the two, convert to a single type
    y_test = y_test.to_numpy()
    
    # shape of test and preds must be equal
    assert y_test.shape == y_pred.shape
    i=0
    # list of scores for each training sample
    scores = []
    
    # for each test sample
    while i < len(y_test):
        count=0
        # count the number of matches in the sample
        # y_test[i] -> row values in test set (true values)
        # y_pred[i] -> row values in predictions set (predicted values)
        for p, q in zip(y_test[i], y_pred[i]):
            if p == q:
                count += 1

        # accuracy score for the sample = no. of correctly predicted labels/total no. of labels
        scores.append(count / y_pred.shape[1])
        i+=1 

    # final accuracy = avg. accuracy over all test samples =
    # sum of the accuracy of all training samples/no. of training samples
    return round((sum(scores)/len(y_test)), 5)

In [314]:
print("Accuracy of Binary Relevance Classifier: " + str(accuracy_score(y_test, y_pred)))

Accuracy of Binary Relevance Classifier: 0.86375


In [315]:
print(y_test)

      RESTAURANT#GENERAL  SERVICE#GENERAL  FOOD#QUALITY  FOOD#STYLE_OPTIONS   
1860                 0.0              1.0           1.0                 0.0  \
353                  0.0              0.0           0.0                 0.0   
1333                 0.0              0.0           0.0                 0.0   
905                  1.0              0.0           0.0                 0.0   
1289                 0.0              0.0           0.0                 0.0   
...                  ...              ...           ...                 ...   
965                  1.0              0.0           0.0                 0.0   
1284                 0.0              0.0           0.0                 0.0   
1739                 0.0              0.0           0.0                 0.0   
261                  1.0              0.0           0.0                 0.0   
535                  1.0              0.0           0.0                 0.0   

      AMBIENCE#GENERAL  Label  
1860               

In [316]:
class SVM:
    def __init__(self, learning_rate=0.001, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None
    
    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.weights = np.zeros(num_features)
        self.bias = 0

        for _ in range(self.num_iterations):
            linear_output = np.dot(X, self.weights) + self.bias
            y_predicted = np.where(linear_output >= 0, 1, -1)

            dW = (1 / num_samples) * np.dot(X.T, (y - y_predicted))
            dB = (1 / num_samples) * np.sum(y - y_predicted)

            self.weights += self.learning_rate * dW
            self.bias += self.learning_rate * dB
    
    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        y_predicted = np.where(linear_output >= 0, 1, -1)
        return y_predicted

# # Initialize SVM classifier
# svm = SVM()

# # Train the SVM classifier
# svm.fit(np.array(features_train), labels_train)

# # Predict the aspect categories for test data
# predictions = svm.predict(np.array(features_test))

# # Evaluation
# def accuracy(y_true, y_pred):
#     return np.mean(y_true == y_pred)

# acc = accuracy(labels_test, predictions)
# print("Accuracy:", acc)

In [317]:
# instantiate the classifier
svm_clf = SVM(SVC())
# fit
svm_clf.fit(X_train, y_train)
# predict
y_pred = svm_clf.predict(X_test)
print("y_pred.shape: " + str(y_pred.shape))

ValueError: Unable to coerce to Series, length must be 6: given 1600

In [None]:
def accuracy_score(y_test, y_pred):
    # y_pred is a numpy array, y_test is a dataframe
    # to compare the two, convert to a single type
    y_test = y_test.to_numpy()
    
    # shape of test and preds must be equal
    assert y_test.shape == y_pred.shape
    i=0
    # list of scores for each training sample
    scores = []
    
    # for each test sample
    while i < len(y_test):
        count=0
        # count the number of matches in the sample
        # y_test[i] -> row values in test set (true values)
        # y_pred[i] -> row values in predictions set (predicted values)
        for p, q in zip(y_test[i], y_pred[i]):
            if p == q:
                count += 1

        # accuracy score for the sample = no. of correctly predicted labels/total no. of labels
        scores.append(count / y_pred.shape[1])
        i+=1 

    # final accuracy = avg. accuracy over all test samples =
    # sum of the accuracy of all training samples/no. of training samples
    return round((sum(scores)/len(y_test)), 5)

In [None]:
print("Accuracy of Support Vector Machine Classifier: " + str(accuracy_score(y_test, y_pred)))

AssertionError: 

In [None]:
print(y_test)

      RESTAURANT#GENERAL  SERVICE#GENERAL  FOOD#QUALITY  FOOD#STYLE_OPTIONS   
1860                 0.0              1.0           1.0                 0.0  \
353                  0.0              0.0           0.0                 0.0   
1333                 0.0              0.0           0.0                 0.0   
905                  1.0              0.0           0.0                 0.0   
1289                 0.0              0.0           0.0                 0.0   
...                  ...              ...           ...                 ...   
965                  1.0              0.0           0.0                 0.0   
1284                 0.0              0.0           0.0                 0.0   
1739                 0.0              0.0           0.0                 0.0   
261                  1.0              0.0           0.0                 0.0   
535                  1.0              0.0           0.0                 0.0   

      AMBIENCE#GENERAL  Label  
1860               

In [None]:
print(y_pred)

[[0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
