In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
from pairing import Reader, Extractor, BaselineClassifier, GBClassifier, FilteredGBClassifier
import definition
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, confusion_matrix

## Data Loading and Preprocessing

In [2]:
# Read the paired training set; each record is a dict with 'token', 'label',
# 'aspect' and 'sentiment' entries (see the sample displayed below).
paired_train_path = definition.DATA_PAIRED_TRAIN
raw_data = Reader.read_file(paired_train_path)

# Display the first record as a sanity check of the loaded structure.
raw_data[0]

{'token': ['kamar',
  'saya',
  'ada',
  'kendala',
  'di',
  'ac',
  'tidak',
  'berfungsi',
  'optimal',
  '.',
  'dan',
  'juga',
  'wifi',
  'koneksi',
  'kurang',
  'stabil',
  '.'],
 'label': ['O',
  'O',
  'O',
  'O',
  'O',
  'B-ASPECT',
  'B-SENTIMENT',
  'I-SENTIMENT',
  'I-SENTIMENT',
  'O',
  'O',
  'O',
  'B-ASPECT',
  'I-ASPECT',
  'B-SENTIMENT',
  'I-SENTIMENT',
  'O'],
 'aspect': [{'start': 5, 'length': 1}, {'start': 12, 'length': 2}],
 'sentiment': [{'start': 6, 'length': 3, 'index_aspect': [0]},
  {'start': 14, 'length': 2, 'index_aspect': [1]}]}

In [3]:
# Artefacts consumed by the feature extractor; all of them live in
# definition.MODEL_UTILITY.
embedding_filename = "fasttext_25.bin"
word_count_filename = "word_count_60.pkl"
clustering_filename = "fasttext_25_kmeans_10.pkl"

# Map each Extractor keyword argument to its artefact file name, then resolve
# every name against the utility-model directory in one place.
extractor_artefacts = {
    "embedding_filename": embedding_filename,
    "word_count_filename": word_count_filename,
    "clustering_filename": clustering_filename,
}
extractor = Extractor(**{
    argument: os.path.join(definition.MODEL_UTILITY, filename)
    for argument, filename in extractor_artefacts.items()
})

# Turn the raw records into the tabular data used below (includes a 'target' column).
data = extractor.extract_data(raw_data)

Extracting data: 100%|█████████████████████████████████████████████████████████████| 4000/4000 [00:50<00:00, 79.17it/s]


In [4]:
# Class balance of the extracted pairs: total rows, positives (target == 1)
# and negatives (target == 0), printed as a tab-separated two-line table.
n_total = len(data)
n_positive = int((data['target'] == 1).sum())
n_negative = int((data['target'] == 0).sum())

print('TOTAL', '[+]', '[-]', sep='\t')
print(n_total, n_positive, n_negative, sep='\t')

TOTAL	[+]	[-]
27894	8748	19146


In [5]:
# Separate the 'target' label column from the remaining columns.
y = data['target']
X = data.drop(columns=['target'])

In [6]:
# Summary statistics for every column of X (all dtypes included).
feature_summary = X.describe(include="all")
feature_summary

Unnamed: 0,_id_aspect,_id_closest_sentiment,_id_sentence,_id_sentiment,_n_aspect,_n_sentiment,c_aspect_0,c_aspect_1,c_aspect_2,c_aspect_3,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
count,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,...,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0,27894.0
mean,1.5812,1.90955,1979.759984,1.953646,4.162401,4.907292,0.007242,0.072955,0.123181,0.00803,...,-0.135304,-1.471815,-0.847627,1.644095,-2.659247,-1.523839,2.037266,-0.163768,-0.100259,0.622427
std,1.793977,1.995934,1148.287233,2.013907,2.360831,2.529206,0.084791,0.260067,0.32865,0.089254,...,2.231485,2.32412,1.444809,1.796288,2.045778,1.737901,1.558514,1.935848,1.631455,1.972799
min,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,-11.651938,-11.382226,-6.450116,-7.797521,-8.07099,-8.140283,-9.02572,-4.582297,-9.375645,-7.301321
25%,0.0,0.0,960.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,...,-1.621894,-2.427751,-1.723075,0.530434,-3.964093,-2.480384,1.283175,-1.595173,-1.107873,-0.056265
50%,1.0,1.0,1994.0,1.0,4.0,4.0,0.0,0.0,0.0,0.0,...,-0.107184,-1.104406,-1.189796,1.901815,-2.745073,-1.55688,2.232946,-0.171981,-0.107229,0.921577
75%,2.0,3.0,2963.0,3.0,5.0,6.0,0.0,0.0,0.0,0.0,...,1.78017,0.127728,0.060727,2.604244,-0.947267,-0.412323,3.324056,0.731406,0.863757,1.899509
max,11.0,13.0,3999.0,13.0,12.0,14.0,1.0,1.0,1.0,1.0,...,3.974328,4.923599,7.467762,7.160624,5.283782,3.990031,5.802044,8.856901,5.893603,7.187962


## Validation

In [7]:
# Underscore-prefixed bookkeeping columns of X (presumably identifiers and
# counts rather than model features -- confirm against Extractor).
dummy_features = ['_id_sentence', '_id_aspect', '_id_sentiment', '_id_closest_sentiment', '_n_aspect', '_n_sentiment']

# Columns dropped when the caller does not say otherwise: every bookkeeping
# column except '_n_sentiment'.
# NOTE(review): '_n_sentiment' is presumably kept because the classifier used
# in this notebook needs it -- confirm against FilteredGBClassifier.
default_dropped_features = [name for name in dummy_features if name != '_n_sentiment']


def drop_dummy_feature(X, labels=None):
    """Return a copy of ``X`` without the given bookkeeping columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is not modified.
    labels : iterable of str, optional
        Column names to remove. Defaults to ``default_dropped_features``
        (all of ``dummy_features`` except ``'_n_sentiment'``). Set this
        depending on the classifier model, e.g. ``labels=dummy_features``
        to remove every bookkeeping column.

    Returns
    -------
    pandas.DataFrame
        ``X`` with the requested columns removed.

    Raises
    ------
    KeyError
        If any requested column is missing from ``X``.
    """
    if labels is None:
        labels = default_dropped_features
    return X.drop(labels=list(labels), axis=1)

In [8]:
n_splits = 5
# Fixed seed for the shuffled fold assignment, so the same sentences land in
# the same folds on every Restart & Run All.
random_state = 42

# Folds are drawn over sentence ids, not rows, so all rows that share an
# '_id_sentence' value end up on the same side of each split.
available_sentence_id = pd.unique(X['_id_sentence'])
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

# Per-fold scores, keyed by the label used in the printed report:
# F1 with class 0 as positive, F1 with class 1 as positive, and macro-averaged F1.
f1_scores = {'f1_0': [], 'f1_1': [], 'f1_a': []}

for split_idx, (train_sentence_id_index, _) in enumerate(kfold.split(available_sentence_id), start=1):
    print("[Split {}/{}]".format(split_idx, n_splits))

    # Row masks: rows of the training sentences, and everything else for testing.
    train_sentence_id = available_sentence_id[train_sentence_id_index]
    train_pointer = X['_id_sentence'].isin(train_sentence_id)
    test_pointer = ~train_pointer
    X_train, y_train = X[train_pointer], y[train_pointer]
    X_test, y_test = X[test_pointer], y[test_pointer]

    # A fresh classifier per fold, trained and evaluated on the reduced feature set.
    model = FilteredGBClassifier()
    model.fit(drop_dummy_feature(X_train), y_train)
    pred = model.predict(drop_dummy_feature(X_test))

    fold_scores = {
        'f1_0': f1_score(y_test, pred, pos_label=0),
        'f1_1': f1_score(y_test, pred, pos_label=1),
        'f1_a': f1_score(y_test, pred, average='macro'),
    }
    for score_name, score_value in fold_scores.items():
        f1_scores[score_name].append(score_value)
        print("{} : {}".format(score_name, score_value))
    print("")
    print(GBClassifier.generate_confusion_matrix_table(y_test, pred))
    print("")

# Average of each score over the folds.
print("[Summary]")
for score_name, score_values in f1_scores.items():
    print("{} : {}".format(score_name, sum(score_values) / n_splits))

[Split 1/5]
f1_0 : 0.955124083836955
f1_1 : 0.8968980797636631
f1_a : 0.9260110818003091

        predicted_0  predicted_1
true_0         3714          102
true_1          247         1518

[Split 2/5]
f1_0 : 0.9569646040234274
f1_1 : 0.9025374855824684
f1_a : 0.9297510448029479

        predicted_0  predicted_1
true_0         3758          131
true_1          207         1565

[Split 3/5]
f1_0 : 0.9575742132755766
f1_1 : 0.9020300088261253
f1_a : 0.929802111050851

        predicted_0  predicted_1
true_0         3758          121
true_1          212         1533

[Split 4/5]
f1_0 : 0.9555079067274189
f1_1 : 0.9016004742145821
f1_a : 0.9285541904710005

        predicted_0  predicted_1
true_0         3565          123
true_1          209         1521

[Split 5/5]
f1_0 : 0.9550031867431484
f1_1 : 0.8954074074074074
f1_a : 0.925205297075278

        predicted_0  predicted_1
true_0         3746          128
true_1          225         1511

[Summary]
f1_0 : 0.9560347989213053
f1_1 : 0.899

## Training

In [9]:
# Train the final model on the complete data set. A fresh classifier is
# created here so this cell does not depend on (or inherit state from) the
# instance left over from the last cross-validation fold.
model = FilteredGBClassifier()
model.fit(drop_dummy_feature(X), y)

In [10]:
# Persist the trained pairing model under definition.MODEL_PAIRING.
model_pairing_filename = "pairing_final.pkl"
model_pairing_path = os.path.join(definition.MODEL_PAIRING, model_pairing_filename)
model.save(model_pairing_path)