In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.insert(0, '..')

import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, confusion_matrix
from xgboost import XGBClassifier

import definition
from pairing import Reader, Extractor, Classifier



## Data Loading and Preprocessing

In [2]:
# Fraction of the raw samples assigned to the training split; the rest is test.
train_ratio = 0.5
# Fixed seed so the shuffle below -- and therefore the split and every score
# computed from it -- is identical on each Restart & Run All.
SEED = 42

raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)

# Seeded permutation of the sample positions, cut once at the train/test boundary.
# A dedicated RandomState is used so the global NumPy RNG state is left untouched.
rng = np.random.RandomState(SEED)
index = rng.permutation(len(raw_data))
n_train = int(train_ratio * len(raw_data))
index_train = index[:n_train]
index_test = index[n_train:]

# Convert to an array once so both splits can be selected by index arrays,
# then hand plain Python lists to the downstream extractor.
raw_array = np.array(raw_data)
raw_data_train = raw_array[index_train].tolist()
raw_data_test = raw_array[index_test].tolist()

In [3]:
# Name of the embedding model file, resolved inside the directory configured
# as definition.MODEL_EMBEDDING.
embedding_filename = "fasttext_25.bin"
embedding_path = os.path.join(definition.MODEL_EMBEDDING, embedding_filename)

# A single extractor instance processes both splits, so train and test features
# come from the same embedding and the same extraction code.
extractor = Extractor(embedding_filename=embedding_path)
data_train = extractor.extract_data(raw_data_train)
data_test = extractor.extract_data(raw_data_test)

100%|█████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:10<00:00, 132.14it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1349/1349 [00:10<00:00, 107.67it/s]


In [4]:
# Preview the first rows of the extracted training features as the cell output.
data_train.head()

Unnamed: 0,cos_aspect_sentence,cos_aspect_sentiment,cos_sentiment_sentence,dist_endpoint,dist_start,len_aspect_char,len_aspect_word,len_sentiment_char,len_sentiment_word,position_aspect,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,0.691922,0.443068,0.775641,5,5,5,2,6,1,5,...,1.28203,0.749835,-1.889201,-1.477875,-2.330542,0.91402,1.087398,1.04568,-0.176657,2.305077
1,0.691922,0.304233,0.684315,3,3,5,2,6,1,5,...,2.705416,-0.870577,-3.812967,0.03656,-2.704789,2.313134,1.14655,0.759748,1.157153,1.478315
2,0.691922,0.406546,0.825912,1,2,5,2,12,2,5,...,2.058678,3.987616,-3.744146,-2.200616,-0.822563,0.304405,0.882733,2.137296,3.383741,2.765986
3,0.691922,0.179718,0.471781,5,6,5,2,5,1,5,...,1.613102,2.087269,-1.92915,-5.534593,-1.989089,0.906808,0.283228,1.993028,2.833731,0.779062
4,0.483977,0.287031,0.319313,1,1,5,1,4,1,0,...,4.294828,2.002966,-0.984168,-0.58778,0.854104,1.837201,-2.010706,-0.428858,0.446776,0.608101


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per training feature column.
data_train.describe()

Unnamed: 0,cos_aspect_sentence,cos_aspect_sentiment,cos_sentiment_sentence,dist_endpoint,dist_start,len_aspect_char,len_aspect_word,len_sentiment_char,len_sentiment_word,position_aspect,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
count,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,...,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0,9522.0
mean,0.603811,0.342966,0.607748,8.347406,8.71214,7.844465,1.335644,8.273682,1.513127,11.563957,...,1.662615,1.428748,-2.056948,-1.391571,-1.271013,0.946903,0.859722,-0.041043,1.457537,1.555126
std,0.1926,0.222255,0.206352,9.431196,9.471476,3.521158,0.57458,4.239877,0.704157,13.279269,...,1.390745,1.780819,1.343076,1.848271,1.813085,1.280624,1.751615,1.935287,1.907427,1.854362
min,-0.378395,-0.521412,-0.315304,1.0,1.0,2.0,1.0,2.0,1.0,0.0,...,-3.302528,-7.640591,-7.264711,-6.575889,-6.069744,-4.93902,-4.367845,-6.454226,-3.135974,-8.615383
25%,0.503224,0.189571,0.50959,2.0,2.0,5.0,1.0,5.0,1.0,2.0,...,0.952697,0.596575,-2.862606,-2.521194,-2.393117,0.161223,-0.087299,-1.562338,-0.135239,0.365948
50%,0.639481,0.374986,0.641629,5.0,5.0,7.0,1.0,7.0,1.0,7.0,...,1.635064,1.385389,-1.927913,-1.084937,-1.368273,0.91402,1.087398,-0.415794,1.361953,1.574045
75%,0.740811,0.508186,0.753564,11.0,12.0,10.0,2.0,11.0,2.0,16.0,...,2.705416,2.868825,-1.217198,-0.056839,-0.25588,1.837201,2.046253,1.04568,3.051475,2.349275
max,0.974018,0.906958,0.986976,81.0,81.0,29.0,5.0,28.0,6.0,81.0,...,6.298478,4.74782,2.568834,4.738568,6.924547,5.558224,5.832686,6.35085,6.240314,7.138523


In [6]:
# Separate the label column ('target') from the feature columns of each split.
y_train, y_test = data_train['target'], data_test['target']
X_train, X_test = (
    split.drop('target', axis='columns') for split in (data_train, data_test)
)

## Training

In [7]:
# Instantiate the project's pairing Classifier with no arguments (its defaults).
model = Classifier()

In [8]:
# Train on the training split. The bare `model.model` expression makes the cell
# display the estimator held by the wrapper together with its hyperparameters.
model.fit(X_train, y_train)
model.model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [9]:
# Predict labels for the test features; `pred` is compared against y_test below.
pred = model.predict(X_test)

In [10]:
# Report F1 for class 0, for class 1, and the macro average over both classes.
# Each entry pairs the output template with the f1_score keyword that selects it.
f1_reports = (
    ("f1_0 : {}", {"pos_label": 0}),
    ("f1_1 : {}", {"pos_label": 1}),
    ("f1_a : {}", {"average": 'macro'}),
)
for template, f1_kwargs in f1_reports:
    print(template.format(f1_score(y_test, pred, **f1_kwargs)))

f1_0 : 0.951857585139319
f1_1 : 0.8911064425770309
f1_a : 0.921482013858175


In [11]:
# Hand the fitted classifier to its save() method with the location configured as
# definition.MODEL_PAIRING_SAMPLEFILE.
# NOTE(review): presumably this serializes the model to disk and overwrites any
# existing file at that path -- confirm in pairing.Classifier.save.
model.save(definition.MODEL_PAIRING_SAMPLEFILE)

In [12]:
# Tabulate true labels (rows) against predicted labels (columns) for the test split.
Classifier.generate_confusion_matrix_table(y_test, pred)

Unnamed: 0,predicted_0,predicted_1
true_0,6149,260
true_1,362,2545
