In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

from pairing import Reader, Extractor, Classifier
import definition
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, confusion_matrix



## Data Loading and Preprocessing

In [2]:
train_ratio = 0.5

raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)
index = np.arange(len(raw_data))
np.random.shuffle(index)
index_train = index[:int(train_ratio * len(raw_data))]
index_test = index[int(train_ratio * len(raw_data)):]
raw_data_train = np.array(raw_data)[index_train].tolist()
raw_data_test = np.array(raw_data)[index_test].tolist()

extractor = Extractor(embedding_filename=definition.MODEL_EMBEDDING_FASTTEXT)
data_train = extractor.extract_data(raw_data_train)
data_test = extractor.extract_data(raw_data_test)

100%|█████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:04<00:00, 299.46it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:04<00:00, 286.92it/s]


In [3]:
data_train.head()

Unnamed: 0,aspect_v_0,aspect_v_1,aspect_v_10,aspect_v_11,aspect_v_12,aspect_v_13,aspect_v_14,aspect_v_15,aspect_v_16,aspect_v_17,...,sentiment_v_23,sentiment_v_24,sentiment_v_3,sentiment_v_4,sentiment_v_5,sentiment_v_6,sentiment_v_7,sentiment_v_8,sentiment_v_9,target
0,4.518739,5.140706,-1.709704,3.558405,2.087172,4.737096,8.623319,-2.612631,-1.138445,-0.352253,...,0.671508,-4.401276,1.001777,-0.536606,-0.528474,2.313473,1.958483,-0.285796,0.096482,1
1,4.518739,5.140706,-1.709704,3.558405,2.087172,4.737096,8.623319,-2.612631,-1.138445,-0.352253,...,0.626209,-3.365917,-1.535994,-2.533818,3.18808,2.585541,-1.638265,2.279348,1.455464,0
2,4.518739,5.140706,-1.709704,3.558405,2.087172,4.737096,8.623319,-2.612631,-1.138445,-0.352253,...,3.571545,-1.746683,-3.991367,-1.021086,1.261697,3.318293,-2.018394,3.366771,5.014288,0
3,4.518739,5.140706,-1.709704,3.558405,2.087172,4.737096,8.623319,-2.612631,-1.138445,-0.352253,...,3.571545,-1.746683,-3.991367,-1.021086,1.261697,3.318293,-2.018394,3.366771,5.014288,0
4,4.518739,5.140706,-1.709704,3.558405,2.087172,4.737096,8.623319,-2.612631,-1.138445,-0.352253,...,4.694343,-4.625687,-0.414016,-0.25588,0.000949,1.852765,-0.281702,2.119982,2.0539,0


In [4]:
data_train.describe()

Unnamed: 0,aspect_v_0,aspect_v_1,aspect_v_10,aspect_v_11,aspect_v_12,aspect_v_13,aspect_v_14,aspect_v_15,aspect_v_16,aspect_v_17,...,sentiment_v_23,sentiment_v_24,sentiment_v_3,sentiment_v_4,sentiment_v_5,sentiment_v_6,sentiment_v_7,sentiment_v_8,sentiment_v_9,target
count,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,...,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0,9432.0
mean,-0.452653,2.039111,0.347584,-0.424099,-1.201977,1.369375,-0.00918,0.847347,0.993974,-0.68298,...,1.420679,-2.061183,-1.369546,-1.287396,0.914829,0.865794,-0.044571,1.461967,1.544672,0.31531
std,1.658077,1.57272,1.708965,1.759354,2.621908,2.416546,1.787312,1.508585,3.574155,1.401256,...,1.798067,1.346033,1.883941,1.797078,1.297967,1.771814,1.949164,1.91401,1.838502,0.464664
min,-6.932917,-4.843931,-8.301933,-5.501853,-11.905328,-7.022311,-5.840508,-5.338267,-4.936593,-6.540143,...,-7.640591,-7.264711,-6.575889,-6.069744,-7.012487,-5.480772,-6.398162,-4.068371,-8.615383,0.0
25%,-1.254877,1.029764,-0.63644,-1.058019,-3.199651,0.47839,-0.787917,0.021839,-1.606518,-1.485871,...,0.574984,-2.862606,-2.572148,-2.338209,0.035077,-0.112871,-1.527333,-0.135777,0.37817,0.0
50%,-0.586115,2.145031,0.407763,-0.306977,-1.272447,1.640943,-0.405791,0.895471,0.264418,-0.713285,...,1.385389,-1.92915,-1.084937,-1.389245,0.91402,1.087398,-0.428858,1.419308,1.51183,0.0
75%,-0.049754,3.390692,1.414494,0.609607,0.635776,3.016767,0.556449,1.990324,2.056653,0.208115,...,2.868825,-1.216745,-0.01065,-0.283596,1.819064,2.007489,1.04568,3.079263,2.433072,1.0
max,4.518739,6.77362,4.358629,4.720596,5.110315,6.697991,8.623319,7.468032,13.862546,3.46823,...,4.74782,2.568834,4.738568,5.421085,5.558224,5.832686,6.35085,5.845491,7.138523,1.0


In [5]:
X_train = data_train.drop('target', axis=1)
X_test = data_test.drop('target', axis=1)
y_train = data_train['target']
y_test = data_test['target']

## Training

In [6]:
model = Classifier()

In [7]:
model.fit(X_train, y_train)
model.model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [8]:
pred = model.predict(X_test)

In [9]:
print("f1_0 : {}".format(f1_score(y_test, pred, pos_label=0)))
print("f1_1 : {}".format(f1_score(y_test, pred, pos_label=1)))
print("f1_a : {}".format(f1_score(y_test, pred, average='macro')))

f1_0 : 0.9524610210944665
f1_1 : 0.8914106145251396
f1_a : 0.921935817809803


In [10]:
Classifier.generate_confusion_matrix_table(y_test, pred)

Unnamed: 0,predicted_0,predicted_1
true_0,6231,230
true_1,392,2553
