In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
from pairing import Reader, Extractor, GBClassifier, FilteredGBClassifier
import definition
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, confusion_matrix

## Data Loading and Preprocessing

In [16]:
# Read the paired sample file referenced by `definition.DATA_PAIRED_SAMPLE`.
raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)
# Display the first record to inspect its structure; per the output below it
# is a dict with 'token', 'label', 'aspect' and 'sentiment' entries, where each
# sentiment span lists the aspects it belongs to under 'index_aspect'.
raw_data[0]

{'token': ['kamar',
  'saya',
  'ada',
  'kendala',
  'di',
  'ac',
  'tidak',
  'berfungsi',
  'optimal',
  '.',
  'dan',
  'juga',
  'wifi',
  'koneksi',
  'kurang',
  'stabil',
  '.'],
 'label': ['O',
  'O',
  'O',
  'O',
  'O',
  'B-ASPECT',
  'B-SENTIMENT',
  'I-SENTIMENT',
  'I-SENTIMENT',
  'O',
  'O',
  'O',
  'B-ASPECT',
  'I-ASPECT',
  'B-SENTIMENT',
  'I-SENTIMENT',
  'O'],
 'aspect': [{'start': 5, 'length': 1}, {'start': 12, 'length': 2}],
 'sentiment': [{'start': 6, 'length': 3, 'index_aspect': [0]},
  {'start': 14, 'length': 2, 'index_aspect': [1]}]}

In [3]:
# File names of the pre-built resources the feature extractor relies on.
embedding_filename = "fasttext_25.bin"
word_count_filename = "word_count_60.pkl"
clustering_filename = "kmeans_10.bin"

# Every resource lives under `definition.MODEL_UTILITY`; resolve the full
# paths first, then hand them to the extractor as keyword arguments.
extractor_kwargs = {
    "embedding_filename": os.path.join(definition.MODEL_UTILITY, embedding_filename),
    "word_count_filename": os.path.join(definition.MODEL_UTILITY, word_count_filename),
    "clustering_filename": os.path.join(definition.MODEL_UTILITY, clustering_filename),
}
extractor = Extractor(**extractor_kwargs)

# Turn the raw records into the tabular data used by the rest of the notebook.
data = extractor.extract_data(raw_data)

Extracting data: 100%|█████████████████████████████████████████████████████████████| 3504/3504 [00:45<00:00, 77.28it/s]


In [4]:
# Class balance of the extracted rows: total, positives (target == 1) and
# negatives (target == 0), printed as a small tab-separated table.
n_total = len(data)
n_positive = int((data['target'] == 1).sum())
n_negative = int((data['target'] == 0).sum())

print('TOTAL', '[+]', '[-]', sep='\t')
print(n_total, n_positive, n_negative, sep='\t')

TOTAL	[+]	[-]
24889	7735	17154


In [5]:
# Separate the label column from the remaining columns.
target_column = 'target'
y = data[target_column]
X = data.drop(columns=[target_column])

In [6]:
# Summary statistics for every column of X; include="all" asks pandas to
# describe columns of any dtype rather than only the numeric ones.
X.describe(include="all")

Unnamed: 0,_id_sentence,_n_aspect,_n_sentiment,c_aspect_0,c_aspect_1,c_aspect_2,c_aspect_3,c_aspect_4,c_aspect_5,c_aspect_6,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
count,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,...,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0,24889.0
mean,1764.48194,4.242316,4.985777,0.005786,0.010969,0.501025,0.144481,0.079794,0.128008,0.008799,...,0.160599,-0.273541,-0.805952,-0.334353,-1.283224,-1.755902,0.066493,1.720652,0.506999,-0.727312
std,1022.274208,2.423298,2.583894,0.075845,0.104158,0.500009,0.351584,0.27098,0.334106,0.093392,...,2.0488,2.032854,2.094116,2.138881,1.659208,1.639468,1.845303,1.870279,2.038841,1.257544
min,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-7.426899,-6.648667,-6.352921,-7.240832,-7.845984,-6.430819,-7.46223,-7.700941,-8.311359,-7.156436
25%,839.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.289075,-1.514033,-1.79201,-2.2021,-2.192458,-2.518814,-1.096474,0.267798,-0.406137,-1.364622
50%,1771.0,4.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.486262,0.142744,-0.790007,-0.210302,-1.457078,-1.851305,-0.274611,2.293756,0.836454,-0.904011
75%,2671.0,5.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.384099,1.26679,0.393694,0.783933,-0.165121,-0.829747,1.027948,3.116357,1.87804,0.115628
max,3503.0,12.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,6.758648,7.202626,4.012051,7.205663,5.160348,8.79932,6.164371,6.436256,7.608002,6.568245


## Validation

In [7]:
def drop_dummy_feature(X):
    """Return a copy of ``X`` without the bookkeeping columns.

    The columns ``_id_sentence``, ``_id_aspect``, ``_id_sentiment`` and
    ``_n_aspect`` are removed before the frame is handed to a model.
    Columns from that list that are not present in ``X`` are skipped
    instead of raising ``KeyError``, so the helper works on frames that
    carry only a subset of them. ``X`` itself is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding feature columns plus any of the bookkeeping columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns removed.
    """
    # NOTE(review): `_n_aspect` is dropped while `_n_sentiment` is kept;
    # presumably intentional feature selection -- confirm.
    dummy_columns = ['_id_sentence', '_id_aspect', '_id_sentiment', '_n_aspect']
    # errors='ignore' drops whichever of the listed columns exist; a typo in
    # this list would therefore go unnoticed, so keep it in sync with the
    # column names actually produced upstream.
    return X.drop(columns=dummy_columns, errors='ignore')

In [8]:
n_splits = 5
# Fixed seed for the fold shuffle so the same sentences land in the same
# folds on every Restart & Run All (KFold with shuffle=True and no
# random_state draws a different partition on each run).
cv_random_state = 42

# Folds are drawn over the unique sentence ids rather than over rows, so every
# row sharing an `_id_sentence` ends up on the same side of a split.
available_sentence_id = pd.unique(X['_id_sentence'])
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=cv_random_state)

# Running sums of the per-fold scores; averaged in the summary below.
f1_0_total = 0.0
f1_1_total = 0.0
f1_a_total = 0.0
for split_idx, (train_sentence_id_index, test_sentence_id_index) in enumerate(
        kfold.split(available_sentence_id), start=1):
    print("[Split {}/{}]".format(split_idx, n_splits))

    # Translate the sentence-level split into row-level boolean masks; the
    # test rows are exactly the rows whose sentence is not in the train fold.
    train_sentence_id = available_sentence_id[train_sentence_id_index]
    train_pointer = X['_id_sentence'].isin(train_sentence_id)
    test_pointer = ~train_pointer
    X_train, y_train = X[train_pointer], y[train_pointer]
    X_test, y_test = X[test_pointer], y[test_pointer]

    # A fresh model is fitted for every fold.
    model = FilteredGBClassifier()
    model.fit(drop_dummy_feature(X_train), y_train)
    pred = model.predict(drop_dummy_feature(X_test))

    # F1 for each class separately plus their macro average.
    f1_0 = f1_score(y_test, pred, pos_label=0)
    f1_1 = f1_score(y_test, pred, pos_label=1)
    f1_a = f1_score(y_test, pred, average='macro')
    f1_0_total += f1_0
    f1_1_total += f1_1
    f1_a_total += f1_a

    print("f1_0 : {}".format(f1_0))
    print("f1_1 : {}".format(f1_1))
    print("f1_a : {}".format(f1_a))
    print("")
    print(GBClassifier.generate_confusion_matrix_table(y_test, pred))
    print("")

# Mean of each score over all folds.
print("[Summary]")
print("f1_0 : {}".format(f1_0_total/n_splits))
print("f1_1 : {}".format(f1_1_total/n_splits))
print("f1_a : {}".format(f1_a_total/n_splits))

[Split 1/5]
f1_0 : 0.951847365233192
f1_1 : 0.8937875751503006
f1_a : 0.9228174701917463

        predicted_0  predicted_1
true_0         3143          112
true_1          206         1338

[Split 2/5]
f1_0 : 0.9607493309545049
f1_1 : 0.9091534755677908
f1_a : 0.9349514032611479

        predicted_0  predicted_1
true_0         3231          103
true_1          161         1321

[Split 3/5]
f1_0 : 0.9538283398465769
f1_1 : 0.8922661263086796
f1_a : 0.9230472330776283

        predicted_0  predicted_1
true_0         3295          110
true_1          209         1321

[Split 4/5]
f1_0 : 0.9597187637322396
f1_1 : 0.9087288416860272
f1_a : 0.9342238027091334

        predicted_0  predicted_1
true_0         3276           90
true_1          185         1369

[Split 5/5]
f1_0 : 0.952948801036941
f1_1 : 0.8837656099903939
f1_a : 0.9183572055136675

        predicted_0  predicted_1
true_0         3676          118
true_1          245         1380

[Summary]
f1_0 : 0.9558185201606909
f1_1 : 0.89

## Training

In [9]:
# Fit the final model on every available row, with the bookkeeping columns
# removed, then display the wrapped estimator and its hyper-parameters.
X_features = drop_dummy_feature(X)
model = FilteredGBClassifier()
model.fit(X_features, y)
model.model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [10]:
# Persist the fitted model under the `definition.MODEL_PAIRING` directory.
model_pairing_filename = "pairing_final.pkl"
model_pairing_path = os.path.join(definition.MODEL_PAIRING, model_pairing_filename)
model.save(model_pairing_path)