In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from pairing import Reader, Extractor, BaselineClassifier, FilteredGBClassifier
import definition

## Data Loading

In [2]:
# Load the file at definition.DATA_PAIRED_TEST. Later cells index the result by
# sentence position and read its 'token', 'label', 'aspect' and 'sentiment' entries.
raw_test = Reader.read_file(definition.DATA_PAIRED_TEST)

## Feature Extraction

In [3]:
# Artefact file names; each one is resolved against definition.MODEL_UTILITY below.
embedding_filename = "word2vec_50.bin"
word_count_filename = "word_count_60.pkl"
clustering_filename = "word2vec_50_kmeans_50.pkl"

# Every Extractor keyword receives the full path of the matching artefact.
extractor = Extractor(
    **{
        keyword: os.path.join(definition.MODEL_UTILITY, filename)
        for keyword, filename in {
            "embedding_filename": embedding_filename,
            "word_count_filename": word_count_filename,
            "clustering_filename": clustering_filename,
        }.items()
    }
)

In [4]:
# Extract the feature table for the test set, then split it into inputs (X)
# and the 'target' column (y).
data = extractor.extract_data(raw_test, progress_bar=True)
X, y = data.drop(columns='target'), data['target']
data.head()

Extracting data: 100%|█████████████████████████████████████████████████████████████| 1000/1000 [00:12<00:00, 82.12it/s]


Unnamed: 0,_id_aspect,_id_closest_sentiment,_id_sentence,_id_sentiment,_n_aspect,_n_sentiment,cos_aspect_sentence,cos_aspect_sentiment,cos_aspect_sentiment_validity,cos_sentiment_sentence,...,v_sentiment_45,v_sentiment_46,v_sentiment_47,v_sentiment_48,v_sentiment_49,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,0,0,0,0,4,4,0.267908,0.101278,1,0.320047,...,-0.108285,-0.642952,-0.112356,-1.794697,-0.522773,1.979624,-1.259967,0.477163,3.576442,1.434866
1,0,0,0,1,4,4,0.267908,0.303174,1,0.417181,...,-0.887965,-0.39967,2.107079,-3.530737,0.874552,1.66572,-0.70683,-0.291078,2.956153,0.3381
2,0,0,0,2,4,4,0.267908,0.166179,1,0.420656,...,1.947059,1.874121,1.645182,-2.73018,1.199623,0.980325,0.29951,-1.215355,1.783817,-1.631933
3,0,0,0,3,4,4,0.267908,0.175303,1,0.643507,...,-0.645404,3.325661,1.444411,-1.549783,-0.22638,-1.563441,0.527881,0.363376,-0.679948,1.696995
4,1,1,0,0,4,4,0.674848,0.085781,1,0.320047,...,-0.108285,-0.642952,-0.112356,-1.794697,-0.522773,1.979624,-1.259967,0.477163,3.576442,1.434866


In [5]:
# Second extraction with include_new_features=False; X_chen is the input of the
# "Chen" model in the pairing section.
data_chen = extractor.extract_data(
    raw_test,
    include_new_features=False,
    progress_bar=True,
)
X_chen = data_chen.drop(columns='target')
data_chen.head()

Extracting data: 100%|█████████████████████████████████████████████████████████████| 1000/1000 [00:12<00:00, 78.42it/s]


Unnamed: 0,_id_aspect,_id_closest_sentiment,_id_sentence,_id_sentiment,_n_aspect,_n_sentiment,cos_aspect_sentence,cos_aspect_sentiment,cos_aspect_sentiment_validity,cos_sentiment_sentence,...,v_sentiment_45,v_sentiment_46,v_sentiment_47,v_sentiment_48,v_sentiment_49,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,0,0,0,0,4,4,0.267908,0.101278,1,0.320047,...,-0.108285,-0.642952,-0.112356,-1.794697,-0.522773,1.979624,-1.259967,0.477163,3.576442,1.434866
1,0,0,0,1,4,4,0.267908,0.303174,1,0.417181,...,-0.887965,-0.39967,2.107079,-3.530737,0.874552,1.66572,-0.70683,-0.291078,2.956153,0.3381
2,0,0,0,2,4,4,0.267908,0.166179,1,0.420656,...,1.947059,1.874121,1.645182,-2.73018,1.199623,0.980325,0.29951,-1.215355,1.783817,-1.631933
3,0,0,0,3,4,4,0.267908,0.175303,1,0.643507,...,-0.645404,3.325661,1.444411,-1.549783,-0.22638,-1.563441,0.527881,0.363376,-0.679948,1.696995
4,1,1,0,0,4,4,0.674848,0.085781,1,0.320047,...,-0.108285,-0.642952,-0.112356,-1.794697,-0.522773,1.979624,-1.259967,0.477163,3.576442,1.434866


## Pairing

In [6]:
# Id/count columns of the extracted frame.
# NOTE(review): this list is not referenced by the cells visible here; the drop
# helpers below hard-code their own subsets. Confirm whether it is still needed.
dummy_features = ['_id_sentence', '_id_aspect', '_id_sentiment', '_id_closest_sentiment', '_n_aspect', '_n_sentiment']

def drop_dummy_feature_baseline(X):
    """Return a new frame without the columns dropped for the baseline classifier.

    Removes ``_id_sentence``, ``_id_aspect`` and ``_n_aspect``; every other
    column is kept. ``X`` itself is not modified. pandas raises ``KeyError``
    if any of the three columns is missing.
    """
    baseline_dropped_columns = ['_id_sentence', '_id_aspect', '_n_aspect']
    return X.drop(columns=baseline_dropped_columns)

def drop_dummy_feature(X):
    """Return a new frame without the columns dropped for the gradient-boosting models.

    Removes ``_id_sentence``, ``_id_aspect``, ``_id_sentiment``,
    ``_id_closest_sentiment`` and ``_n_aspect``; every other column (including
    ``_n_sentiment``) is kept. ``X`` itself is not modified. pandas raises
    ``KeyError`` if any of the five columns is missing.
    """
    dropped_columns = [
        '_id_sentence',
        '_id_aspect',
        '_id_sentiment',
        '_id_closest_sentiment',
        '_n_aspect',
    ]
    return X.drop(columns=dropped_columns)

In [7]:
# The baseline is used as constructed; no saved model is loaded for it.
baseline = BaselineClassifier()

# NOTE(review): the .pkl extension suggests pickle-based loading -- only load
# trusted files; confirm in FilteredGBClassifier.load.

# Model applied to X (reported as "TA" in the scores).
gb = FilteredGBClassifier()
gb.load(os.path.join(definition.MODEL_PAIRING, "pairing_final.pkl"))

# Model applied to X_chen (reported as "Chen" in the scores).
gb_chen = FilteredGBClassifier()
gb_chen.load(os.path.join(definition.MODEL_PAIRING, "pairing_final_chen.pkl"))

In [8]:
# Predict with each model on its own view of the extracted features.
pred_baseline = baseline.predict(drop_dummy_feature_baseline(X))
pred_gb = gb.predict(drop_dummy_feature(X))
pred_gb_chen = gb_chen.predict(drop_dummy_feature(X_chen))

# One "<name> : <score>" line per model; scores use average='macro' against y.
print("Baseline : ", BaselineClassifier.f1_score(y, pred_baseline, average='macro'), sep='')
print("TA : ", FilteredGBClassifier.f1_score(y, pred_gb, average='macro'), sep='')
print("Chen : ", FilteredGBClassifier.f1_score(y, pred_gb_chen, average='macro'), sep='')

Baseline : 0.9231344012979777
TA : 0.9335691571531568
Chen : 0.9347613612335239


## Demo Sample

In [16]:
# Position in raw_test of the sentence shown in the demo cells; the same value is
# matched against the '_id_sentence' column when selecting predicted pairs.
demo_index = 2

In [17]:
# Show the demo sentence as a single space-separated string.
' '.join(raw_test[demo_index]['token'])

'dengan harga terjangkau kita sudah mendapatkan fasilitas yang nyaman .'

In [18]:
# Side-by-side view of the demo sentence's tokens and their labels.
pd.DataFrame({
    'token': raw_test[demo_index]['token'],
    'label': raw_test[demo_index]['label'],
})

Unnamed: 0,token,label
0,dengan,O
1,harga,B-ASPECT
2,terjangkau,B-SENTIMENT
3,kita,O
4,sudah,O
5,mendapatkan,O
6,fasilitas,B-ASPECT
7,yang,O
8,nyaman,B-SENTIMENT
9,.,O


In [19]:
# Ground Truth

# (aspect id, sentiment id) of every row of the demo sentence whose label is 1.
truth_rows = X.iloc[y.values == 1]
positive_pairs_true = truth_rows.loc[
    truth_rows['_id_sentence'] == demo_index,
    ['_id_aspect', '_id_sentiment'],
].values

# Turn each id pair back into text using the start/length spans of the sentence.
demo_sentence = raw_test[demo_index]
demo_tokens = demo_sentence['token']
for aspect_id, sentiment_id in positive_pairs_true:
    aspect_span = demo_sentence['aspect'][aspect_id]
    sentiment_span = demo_sentence['sentiment'][sentiment_id]
    aspect_terms = demo_tokens[aspect_span['start']:aspect_span['start'] + aspect_span['length']]
    sentiment_terms = demo_tokens[sentiment_span['start']:sentiment_span['start'] + sentiment_span['length']]
    print("(%s, %s)" % (' '.join(aspect_terms), ' '.join(sentiment_terms)))

(harga, terjangkau)
(fasilitas, nyaman)


In [20]:
# Baseline

# (aspect id, sentiment id) of every row of the demo sentence the baseline predicts as 1.
baseline_rows = X.iloc[pred_baseline == 1]
positive_pairs_baseline = baseline_rows.loc[
    baseline_rows['_id_sentence'] == demo_index,
    ['_id_aspect', '_id_sentiment'],
].values

# Turn each id pair back into text using the start/length spans of the sentence.
demo_sentence = raw_test[demo_index]
demo_tokens = demo_sentence['token']
for aspect_id, sentiment_id in positive_pairs_baseline:
    aspect_span = demo_sentence['aspect'][aspect_id]
    sentiment_span = demo_sentence['sentiment'][sentiment_id]
    aspect_terms = demo_tokens[aspect_span['start']:aspect_span['start'] + aspect_span['length']]
    sentiment_terms = demo_tokens[sentiment_span['start']:sentiment_span['start'] + sentiment_span['length']]
    print("(%s, %s)" % (' '.join(aspect_terms), ' '.join(sentiment_terms)))

(harga, terjangkau)
(fasilitas, nyaman)


In [21]:
# Chen et al. (2018)

# pred_gb_chen was computed from X_chen, so the boolean mask is applied to X_chen
# (not X): iloc masks are positional, and selecting from the frame the predictions
# came from keeps the ids correct even if the two extractions ever differ in rows.
chen_rows = X_chen.iloc[pred_gb_chen == 1]
positive_pairs_gb_chen = chen_rows.loc[
    chen_rows['_id_sentence'] == demo_index,
    ['_id_aspect', '_id_sentiment'],
].values

# Turn each id pair back into text using the start/length spans of the sentence.
demo_sentence = raw_test[demo_index]
demo_tokens = demo_sentence['token']
for aspect_id, sentiment_id in positive_pairs_gb_chen:
    aspect_span = demo_sentence['aspect'][aspect_id]
    sentiment_span = demo_sentence['sentiment'][sentiment_id]
    aspect_terms = demo_tokens[aspect_span['start']:aspect_span['start'] + aspect_span['length']]
    sentiment_terms = demo_tokens[sentiment_span['start']:sentiment_span['start'] + sentiment_span['length']]
    print("(%s, %s)" % (' '.join(aspect_terms), ' '.join(sentiment_terms)))

(harga, terjangkau)
(fasilitas, nyaman)


In [22]:
# TA

# (aspect id, sentiment id) of every row of the demo sentence that gb predicts as 1.
gb_rows = X.iloc[pred_gb == 1]
positive_pairs_gb = gb_rows.loc[
    gb_rows['_id_sentence'] == demo_index,
    ['_id_aspect', '_id_sentiment'],
].values

# Turn each id pair back into text using the start/length spans of the sentence.
demo_sentence = raw_test[demo_index]
demo_tokens = demo_sentence['token']
for aspect_id, sentiment_id in positive_pairs_gb:
    aspect_span = demo_sentence['aspect'][aspect_id]
    sentiment_span = demo_sentence['sentiment'][sentiment_id]
    aspect_terms = demo_tokens[aspect_span['start']:aspect_span['start'] + aspect_span['length']]
    sentiment_terms = demo_tokens[sentiment_span['start']:sentiment_span['start'] + sentiment_span['length']]
    print("(%s, %s)" % (' '.join(aspect_terms), ' '.join(sentiment_terms)))

(harga, terjangkau)
(fasilitas, nyaman)
