In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

from pairing import Reader, Extractor, Classifier
import definition
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, confusion_matrix



## Data Loading and Preprocessing

In [2]:
# Fraction of the raw samples assigned to the training split; the remainder
# becomes the test split.
train_ratio = 0.5
# Fixed seed so the shuffle below (and therefore the split and every metric
# computed from it) is reproducible on Restart Kernel -> Run All.
random_seed = 42

raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)

# Shuffle record positions with a dedicated seeded generator rather than the
# global NumPy random state, then cut once at the train/test boundary.
index = np.random.RandomState(random_seed).permutation(len(raw_data))
n_train = int(train_ratio * len(raw_data))
index_train = index[:n_train]
index_test = index[n_train:]

# Convert once and reuse for both splits (fancy indexing needs an ndarray).
# NOTE(review): np.array() may coerce mixed-type records to a common dtype
# before .tolist() — confirm that Reader.read_file output survives the round-trip.
raw_array = np.array(raw_data)
raw_data_train = raw_array[index_train].tolist()
raw_data_test = raw_array[index_test].tolist()

# Extract features for each split using the same Extractor instance.
extractor = Extractor(embedding_filename=definition.MODEL_EMBEDDING_FASTTEXT)
data_train = extractor.extract_data(raw_data_train)
data_test = extractor.extract_data(raw_data_test)

100%|█████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:10<00:00, 130.04it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:11<00:00, 122.28it/s]


In [3]:
# Preview the first rows of the extracted training frame as the cell output.
data_train.head()

Unnamed: 0,cos_aspect_sentence,cos_aspect_sentiment,cos_sentiment_sentence,dist_endpoint,dist_start,len_aspect_char,len_aspect_word,len_sentiment_char,len_sentiment_word,position_aspect,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,0.413855,0.158756,0.803234,5,5,2,1,7,1,0,...,3.196746,0.83631,-2.184458,-0.088331,-2.93435,2.018211,-0.424773,-0.98204,2.491436,2.830323
1,0.661262,-0.022221,0.701204,1,1,6,1,3,1,0,...,3.345881,3.753292,-2.480381,-0.130896,3.346467,-0.215562,-2.631033,5.420425,-3.135974,-2.315062
2,0.631123,-0.134142,0.446785,4,5,8,2,12,2,5,...,1.771898,-1.217827,-3.449997,0.051796,-2.888746,1.031273,1.395283,2.036354,1.258633,-0.035212
3,0.631123,-0.032364,0.47255,2,2,8,2,3,1,5,...,3.35128,-0.324347,-0.175689,-0.723112,0.263789,0.058111,-1.188654,4.194355,-2.34575,-2.271207
4,0.448238,0.213353,0.845413,1,1,2,1,21,3,5,...,1.771229,1.437353,-2.060251,-2.487451,-1.431126,0.890898,1.632694,-1.510008,2.809458,1.2603


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the
# training frame, shown as the cell output.
data_train.describe()

Unnamed: 0,cos_aspect_sentence,cos_aspect_sentiment,cos_sentiment_sentence,dist_endpoint,dist_start,len_aspect_char,len_aspect_word,len_sentiment_char,len_sentiment_word,position_aspect,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
count,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,...,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0,9229.0
mean,0.604134,0.341933,0.609391,8.091668,8.440351,7.918843,1.32322,8.263517,1.489435,11.160256,...,1.662473,1.389782,-2.0585,-1.254967,-1.237908,0.968281,0.827795,-0.033295,1.383989,1.515585
std,0.191985,0.220772,0.198965,9.293232,9.318164,3.501009,0.567586,4.366166,0.696936,12.921087,...,1.452381,1.786097,1.336226,1.787497,1.811845,1.249751,1.766223,1.934155,1.894641,1.847547
min,-0.395323,-0.521412,-0.403111,1.0,1.0,2.0,1.0,2.0,1.0,0.0,...,-7.283158,-7.640591,-7.264711,-6.575889,-6.069744,-7.012487,-5.480772,-6.454226,-4.068371,-7.241373
25%,0.51275,0.185815,0.508052,2.0,2.0,5.0,1.0,5.0,1.0,2.0,...,0.929347,0.56076,-2.862606,-2.200616,-2.353731,0.161223,-0.149668,-1.44757,-0.176657,0.377876
50%,0.64008,0.369255,0.64192,5.0,5.0,8.0,1.0,6.0,1.0,7.0,...,1.62779,1.385389,-1.931585,-0.945945,-1.301695,0.91402,1.087398,-0.415794,1.31307,1.478315
75%,0.738279,0.508387,0.751027,11.0,11.0,10.0,2.0,11.0,2.0,16.0,...,2.705416,2.780261,-1.009458,0.03656,-0.229155,1.837201,1.941379,1.04568,2.914556,2.305077
max,0.963431,0.906958,0.991722,87.0,88.0,34.0,6.0,33.0,6.0,79.0,...,4.972502,4.74782,2.092813,4.738568,5.421085,4.918968,5.832686,6.35085,6.357422,7.138523


In [5]:
# Labels: the 'target' column of each split.
y_train, y_test = data_train['target'], data_test['target']

# Features: every remaining column (drop returns a new frame; the originals
# are left untouched).
X_train = data_train.drop(columns=['target'])
X_test = data_test.drop(columns=['target'])

## Training and Evaluation

In [6]:
# Build the Classifier imported from `pairing`, constructed with no arguments.
model = Classifier()

In [7]:
# Fit the classifier on the training features and labels.
model.fit(X_train, y_train)
# Show the estimator held in `model.model` (and its hyperparameters) as the
# cell output.
model.model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [8]:
# Predict on the held-out test features; `pred` is compared against y_test below.
pred = model.predict(X_test)

In [9]:
# Report F1 three ways: treating class 0 as positive, treating class 1 as
# positive, and the macro average over classes. Each entry pairs the output
# template with the keyword arguments forwarded to f1_score.
f1_reports = (
    ("f1_0 : {}", {"pos_label": 0}),
    ("f1_1 : {}", {"pos_label": 1}),
    ("f1_a : {}", {"average": 'macro'}),
)
for template, score_options in f1_reports:
    print(template.format(f1_score(y_test, pred, **score_options)))

f1_0 : 0.951823302697895
f1_1 : 0.8864827104435906
f1_a : 0.9191530065707427


In [10]:
# Tabulate true vs. predicted class counts for the test predictions
# (rows true_*, columns predicted_*), shown as the cell output.
Classifier.generate_confusion_matrix_table(y_test, pred)

Unnamed: 0,predicted_0,predicted_1
true_0,6421,244
true_1,406,2538
