In [1]:
%load_ext autoreload
%autoreload 2

import sys
# Prepend the parent directory to the module search path.
# NOTE(review): presumably this is what makes the local `pairing` and
# `definition` modules importable from this notebook's folder — confirm.
sys.path.insert(0, '..')

import warnings
# Install a global "ignore" filter: every warning raised after this line is
# suppressed, including deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from pairing import Reader, Extractor, FilteredGBClassifier
import definition

## Data Loading

In [2]:
# Read the labelled sample file referenced by `definition.DATA_LABELLED_SAMPLE`.
raw_data = Reader.read_file(
    definition.DATA_LABELLED_SAMPLE,
    with_target=False,
)

# Preview: the first record's tokens joined into one space-separated string.
first_record_tokens = raw_data[0]['token']
' '.join(first_record_tokens)

'kamar oke , bersih , rapi dan fasilitas oke . hanya ac yang tidak bisa dingin , walaupun sudah di setting suhu rendah dan fan jetfan . mohon diperbaiki . overall oke . terima kasih .'

In [3]:
# Tabulate the first record's tokens alongside their labels.
first_record = raw_data[0]
pd.DataFrame({
    'token': first_record['token'],
    'label': first_record['label'],
})

Unnamed: 0,token,label
0,kamar,B-ASPECT
1,oke,B-SENTIMENT
2,",",O
3,bersih,B-SENTIMENT
4,",",O
5,rapi,B-SENTIMENT
6,dan,O
7,fasilitas,B-ASPECT
8,oke,B-SENTIMENT
9,.,O


## Feature Extraction

In [4]:
# File names of the utility artefacts; each is resolved against
# `definition.MODEL_UTILITY` below.
embedding_filename = "fasttext_25.bin"
word_count_filename = "word_count_60.pkl"
clustering_filename = "kmeans_10.bin"

# Build the three paths once and hand them to the extractor as keyword arguments.
utility_dir = definition.MODEL_UTILITY
extractor_paths = {
    'embedding_filename': os.path.join(utility_dir, embedding_filename),
    'word_count_filename': os.path.join(utility_dir, word_count_filename),
    'clustering_filename': os.path.join(utility_dir, clustering_filename),
}
extractor = Extractor(**extractor_paths)

# Turn the raw records into the feature table and show its first rows.
data = extractor.extract_data(raw_data, progress_bar=False, with_target=False)
data.head()

Unnamed: 0,_id_aspect,_id_sentence,_id_sentiment,_n_aspect,_n_sentiment,c_aspect_0,c_aspect_1,c_aspect_2,c_aspect_3,c_aspect_4,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,0,0,0,4,6,0.0,0.0,1.0,0.0,0.0,...,2.662194,4.456686,-0.346311,0.535727,0.365226,0.446712,3.81956,3.2129,0.653219,0.586194
1,0,0,1,4,6,0.0,0.0,1.0,0.0,0.0,...,0.854022,1.550181,-1.105873,-3.82473,-1.457078,-2.518814,-0.274611,2.614733,0.836454,-1.148345
2,0,0,2,4,6,0.0,0.0,1.0,0.0,0.0,...,1.006981,0.890579,-0.016912,-4.710052,-3.028859,-1.737442,-0.764046,3.30333,1.02123,-1.742602
3,0,0,3,4,6,0.0,0.0,1.0,0.0,0.0,...,2.662194,4.456686,-0.346311,0.535727,0.365226,0.446712,3.81956,3.2129,0.653219,0.586194
4,0,0,4,4,6,0.0,0.0,1.0,0.0,0.0,...,-2.203774,-4.344076,4.012051,0.783933,-2.05429,-1.234127,-0.754624,3.932025,-2.572457,-2.025486


## Pairing

In [5]:
def drop_dummy_feature(X):
    """Return a copy of ``X`` without the identifier/bookkeeping columns.

    Removes ``_id_sentence``, ``_id_aspect``, ``_id_sentiment`` and
    ``_n_aspect``; every other column is kept and ``X`` itself is not
    modified. Raises ``KeyError`` if any of the four columns is absent.

    NOTE(review): ``_n_sentiment`` is kept while ``_n_aspect`` is dropped —
    presumably to match the columns the saved model was fitted on; confirm
    before changing this list.
    """
    bookkeeping_columns = [
        '_id_sentence',
        '_id_aspect',
        '_id_sentiment',
        '_n_aspect',
    ]
    return X.drop(columns=bookkeeping_columns)

In [6]:
# Location of the saved pairing model under `definition.MODEL_PAIRING`.
model_pairing_filename = "pairing_final.pkl"
pairing_model_path = os.path.join(definition.MODEL_PAIRING, model_pairing_filename)

# NOTE(review): the `.pkl` extension suggests a pickle file — only load files
# from a trusted source; confirm against `FilteredGBClassifier.load`.
gb = FilteredGBClassifier()
gb.load(pairing_model_path)

In [7]:
def _span_text(tokens, span):
    """Join the tokens covered by ``span`` (a mapping with 'start' and 'length') with spaces."""
    begin = span['start']
    return ' '.join(tokens[begin:begin + span['length']])


# Classify every candidate row of the feature table.
prediction = gb.predict(drop_dummy_feature(data))

# Rows labelled 1 are the accepted aspect/sentiment pairs.
positive_rows = data.iloc[prediction == 1]
positive_pairs = positive_rows[['_id_aspect', '_id_sentiment']].values
sentence_ids = positive_rows['_id_sentence'].values

for sentence_id, (aspect_id, sentiment_id) in zip(sentence_ids, positive_pairs):
    # Resolve each pair against the record it came from rather than always
    # the first one.
    # NOTE(review): assumes `_id_sentence` is the positional index of the
    # record in `raw_data` — confirm against `Extractor.extract_data`.
    record = raw_data[int(sentence_id)]
    aspect_text = _span_text(record['token'], record['aspect'][aspect_id])
    sentiment_text = _span_text(record['token'], record['sentiment'][sentiment_id])
    print(aspect_text, '\u2192', sentiment_text)

kamar → oke
kamar → bersih
kamar → rapi
fasilitas → oke
ac → tidak bisa dingin
overall → oke
