In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

from pairing import Reader, Extractor, Classifier
import definition
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, confusion_matrix



## Data Loading and Preprocessing

In [2]:
# Fraction of the raw samples used for training; the remainder is held out for testing.
train_ratio = 0.5
# Fixed seed so the shuffle -- and therefore the train/test split and every
# metric computed from it below -- is reproducible across kernel restarts.
SEED = 42

raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)

# Shuffle the sample indices with a dedicated, seeded generator instead of the
# global NumPy random state, so unrelated code cannot silently change the split.
rng = np.random.RandomState(SEED)
index = rng.permutation(len(raw_data))

# Compute the cut point once; both partitions are sliced from the same
# permutation, so they are disjoint and together cover every sample.
n_train = int(train_ratio * len(raw_data))
index_train = index[:n_train]
index_test = index[n_train:]

# Convert to an array once (the original converted twice) to allow fancy
# indexing, then go back to plain Python lists for the extractor.
raw_array = np.array(raw_data)
raw_data_train = raw_array[index_train].tolist()
raw_data_test = raw_array[index_test].tolist()

# Build the feature frames for both partitions with the same extractor instance.
extractor = Extractor(embedding_filename=definition.MODEL_EMBEDDING_FASTTEXT)
data_train = extractor.extract_data(raw_data_train)
data_test = extractor.extract_data(raw_data_test)

100%|█████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:10<00:00, 172.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:10<00:00, 133.52it/s]


In [3]:
# Preview the first rows of the extracted training features as a quick sanity check.
data_train.head()

Unnamed: 0,cos_aspect_sentence,cos_aspect_sentiment,cos_sentiment_sentence,dist_endpoint,dist_start,len_aspect_char,len_aspect_word,len_sentiment_char,len_sentiment_word,position_aspect,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,0.605622,0.397063,0.965333,1,1,8,1,9,2,0,...,0.758534,0.654685,-2.786533,-0.036265,-1.938957,1.241433,1.657082,0.146149,-1.639964,0.268669
1,0.819192,0.485816,0.638946,1,1,5,1,6,1,0,...,1.28203,0.749835,-1.889201,-1.477875,-2.330542,0.91402,1.087398,1.04568,-0.176657,2.305077
2,0.819192,0.473005,0.747654,6,6,5,1,6,1,0,...,2.705416,-0.870577,-3.812967,0.03656,-2.704789,2.313134,1.14655,0.759748,1.157153,1.478315
3,0.819192,0.522568,0.709603,16,16,5,1,9,1,0,...,0.67639,0.574984,-1.677302,-1.716099,-1.470744,2.189908,1.132772,-0.985259,2.145703,0.365948
4,0.819192,0.641294,0.732357,19,19,5,1,13,2,0,...,1.330207,1.533208,-2.638434,0.126818,-1.998133,0.16495,0.465589,0.299282,1.553331,0.973103


In [4]:
# Per-column summary statistics (count, mean, std, quantiles) of the training features.
data_train.describe()

Unnamed: 0,cos_aspect_sentence,cos_aspect_sentiment,cos_sentiment_sentence,dist_endpoint,dist_start,len_aspect_char,len_aspect_word,len_sentiment_char,len_sentiment_word,position_aspect,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
count,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,...,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0,9408.0
mean,0.605863,0.337599,0.603001,8.353316,8.706526,8.026892,1.342474,8.144664,1.489158,11.211841,...,1.693936,1.393683,-2.025261,-1.321155,-1.258043,0.972081,0.793642,0.013151,1.422546,1.501862
std,0.194791,0.223198,0.206512,9.648613,9.674388,3.654271,0.590898,4.206306,0.683205,12.959705,...,1.415581,1.77647,1.354861,1.847395,1.814365,1.251679,1.744974,1.937166,1.899684,1.855098
min,-0.378395,-0.521412,-0.403111,1.0,1.0,2.0,1.0,2.0,1.0,0.0,...,-7.283158,-5.795476,-7.264711,-6.575889,-6.069744,-7.012487,-5.480772,-6.454226,-4.068371,-8.615383
25%,0.51394,0.183554,0.502273,2.0,2.0,5.0,1.0,5.0,1.0,2.0,...,1.024802,0.50823,-2.853587,-2.481988,-2.395923,0.161223,-0.169278,-1.44757,-0.166117,0.319599
50%,0.644277,0.367614,0.635291,5.0,5.0,8.0,1.0,6.0,1.0,7.0,...,1.713377,1.385389,-1.906032,-1.084937,-1.362475,0.91402,1.087398,-0.335888,1.310623,1.478315
75%,0.741748,0.504387,0.749122,11.0,12.0,10.0,2.0,11.0,2.0,16.0,...,2.705416,2.821995,-1.08138,0.007812,-0.242517,1.837201,1.92772,1.04568,3.021916,2.305077
max,0.981345,0.906958,0.986976,87.0,88.0,29.0,5.0,30.0,6.0,79.0,...,4.972502,4.74782,2.568834,4.738568,6.924547,5.206373,5.832686,6.35085,6.357422,7.138523


In [5]:
# Separate the 'target' label column from the feature columns of each partition.
y_train, y_test = data_train['target'], data_test['target']
X_train, X_test = (
    frame.drop('target', axis=1) for frame in (data_train, data_test)
)

## Training and Evaluation

In [6]:
# Instantiate the project's pairing Classifier with no arguments (its defaults).
model = Classifier()

In [7]:
# Fit on the training features/labels, then display the underlying estimator
# (exposed as `model.model`) so the hyper-parameters used are recorded in the output.
model.fit(X_train, y_train)
model.model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [8]:
# Predict labels for the held-out test features; `pred` is scored against y_test below.
pred = model.predict(X_test)

In [9]:
# Per-class F1: treat each label in turn as the positive class.
for positive_label in (0, 1):
    class_f1 = f1_score(y_test, pred, pos_label=positive_label)
    print("f1_{} : {}".format(positive_label, class_f1))

# Macro-averaged F1 across the classes.
macro_f1 = f1_score(y_test, pred, average='macro')
print("f1_a : {}".format(macro_f1))

f1_0 : 0.9523012552301255
f1_1 : 0.8902887139107611
f1_a : 0.9212949845704433


In [10]:
# Persist the fitted model to the path configured in definition.MODEL_PAIRING_SAMPLEFILE.
model.save(definition.MODEL_PAIRING_SAMPLEFILE)

In [11]:
# Tabulate true vs. predicted labels on the test set (rows true_*, columns predicted_*).
Classifier.generate_confusion_matrix_table(y_test, pred)

Unnamed: 0,predicted_0,predicted_1
true_0,6259,255
true_1,372,2544
