In [1]:
%load_ext autoreload
%autoreload 2

import sys
# Put the parent directory at the front of the module search path so the
# project-local imports further down (`pairing`, `definition`) can be resolved.
# NOTE(review): presumably the notebook sits one level below the project root — confirm.
sys.path.insert(0, '..')

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below; consider narrowing the filter.
warnings.filterwarnings("ignore")

import os
from pairing import Reader, Extractor, GBClassifier
import definition
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, confusion_matrix

## Data Loading and Preprocessing

In [2]:
# Fraction of the raw samples assigned to the training split; the remainder is
# held out for testing.
train_ratio = 0.75
# Fixed seed: without it the shuffle below differs on every run, so the split
# membership (and every count and metric derived from it) is not reproducible
# under Restart & Run All.
random_seed = 42

raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)

# Shuffle sample positions rather than the samples themselves.
np.random.seed(random_seed)
index = np.arange(len(raw_data))
np.random.shuffle(index)

# One cut point shared by both slices, so the two index sets are disjoint and
# together cover every sample exactly once.
n_train = int(train_ratio * len(raw_data))
index_train = index[:n_train]
index_test = index[n_train:]

# Convert to an array once and reuse it for both splits (fancy indexing by the
# shuffled positions), then go back to plain Python lists.
# NOTE(review): this assumes the samples survive the np.array round-trip
# unchanged — confirm against what Reader.read_file returns.
raw_data_array = np.array(raw_data)
raw_data_train = raw_data_array[index_train].tolist()
raw_data_test = raw_data_array[index_test].tolist()

In [3]:
# Sizes of the full dataset, the training split and the test split, one per line.
print(len(raw_data), len(raw_data_train), len(raw_data_test), sep='\n')

3504
2628
876


In [4]:
# File names of the two resources handed to the extractor; both are resolved
# relative to definition.MODEL_UTILITY.
embedding_filename = "fasttext_25.bin"
word_count_filename = "word_count_60.pkl"

embedding_path = os.path.join(definition.MODEL_UTILITY, embedding_filename)
word_count_path = os.path.join(definition.MODEL_UTILITY, word_count_filename)

extractor = Extractor(
    embedding_filename=embedding_path,
    word_count_filename=word_count_path,
)

# Run the same extractor over each split (training first, then test). The
# results are indexed by column name and summarised with .head()/.describe()
# in later cells, so they are presumably pandas DataFrames — confirm.
data_train = extractor.extract_data(raw_data_train)
data_test = extractor.extract_data(raw_data_test)

Extracting data: 100%|█████████████████████████████████████████████████████████████| 2628/2628 [00:28<00:00, 91.38it/s]
Extracting data: 100%|███████████████████████████████████████████████████████████████| 876/876 [00:10<00:00, 87.34it/s]


In [5]:
def _class_balance(frame):
    """Return (row count, rows with target == 1, rows with target == 0) for `frame`."""
    targets = frame['target']
    positives = int((targets == 1).sum())
    negatives = int((targets == 0).sum())
    return len(frame), positives, negatives


# One tab-separated row per split: training first, then test.
print('TOTAL', '[+]', '[-]', sep='\t')
for split_frame in (data_train, data_test):
    print(*_class_balance(split_frame), sep='\t')

TOTAL	[+]	[-]
18417	5785	12632
6462	1942	4520


In [6]:
# Preview the first rows of the extracted training data to eyeball the columns.
data_train.head()

Unnamed: 0,cos_aspect_sentence,cos_aspect_sentiment,cos_aspect_sentiment_validity,cos_sentiment_sentence,dist_endpoint,dist_start,id_sentence,idf_aspect,idf_sentiment,len_aspect_char,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,0.443822,0.298354,1,0.540095,1,2,1,3.314935,4.395943,9,...,-2.203774,-4.344076,4.012051,0.783933,-2.05429,-1.234127,-0.754624,3.932025,-2.572457,-2.025486
1,0.443822,-0.140978,1,0.347339,7,8,1,3.314935,3.290396,9,...,2.662194,4.456686,-0.346311,0.535727,0.365226,0.446712,3.81956,3.2129,0.653219,0.586194
2,0.812196,-0.055786,1,0.328193,3,4,1,4.217195,4.395943,16,...,-1.312234,-1.745488,-1.533929,1.279536,0.149017,-2.779657,-0.714473,0.008457,-1.77474,-1.2963
3,0.645552,0.036263,1,0.347339,1,2,1,4.217195,3.290396,16,...,2.662194,4.456686,-0.346311,0.535727,0.365226,0.446712,3.81956,3.2129,0.653219,0.586194
4,0.600134,0.087632,1,0.479475,9,9,3,3.966281,3.330025,17,...,1.886635,0.667913,-3.039409,2.095493,-0.684248,0.244786,0.716682,2.76125,4.262498,-0.904011


In [7]:
# Per-column summary statistics of the extracted training data.
# NOTE(review): in the recorded output `cos_aspect_sentiment_validity` has
# mean 1.0 and std 0.0, i.e. it is constant on this split and carries no signal.
data_train.describe()

Unnamed: 0,cos_aspect_sentence,cos_aspect_sentiment,cos_aspect_sentiment_validity,cos_sentiment_sentence,dist_endpoint,dist_start,id_sentence,idf_aspect,idf_sentiment,len_aspect_char,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
count,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,...,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0,18417.0
mean,0.5563,0.224229,1.0,0.519889,8.480697,8.840311,1289.589618,4.04779,3.745156,7.980018,...,0.17317,-0.193202,-1.134902,-0.259238,-1.22808,-1.788514,0.280478,1.363642,0.802932,-0.645202
std,0.197179,0.218764,0.0,0.196877,9.711128,9.752334,761.859551,1.915014,1.820039,3.582425,...,2.240272,1.968863,2.208318,2.426007,1.809454,1.78087,1.916952,2.182419,2.068242,1.301259
min,-0.406663,-0.61522,1.0,-0.61415,1.0,1.0,1.0,1.388456,1.52089,2.0,...,-6.830917,-7.01447,-6.352921,-5.474275,-7.845984,-6.430819,-7.311463,-11.645122,-8.311359,-7.254439
25%,0.449906,0.074771,1.0,0.422395,2.0,2.0,611.0,2.80377,2.218872,5.0,...,-1.374574,-1.49734,-1.95206,-2.2021,-2.192458,-2.523826,-0.928411,-0.00435,-0.218355,-1.364622
50%,0.600144,0.234023,1.0,0.551552,5.0,5.0,1309.0,3.706714,3.330025,8.0,...,0.628506,0.212176,-1.105873,-0.560764,-1.328322,-1.851305,0.339579,1.704419,0.99863,-0.864187
75%,0.700164,0.38212,1.0,0.654537,11.0,12.0,1957.0,5.172738,4.577961,10.0,...,1.619385,1.322042,-0.016912,0.803365,0.042179,-0.618254,1.497273,3.032854,1.959383,0.380631
max,0.932394,0.859089,1.0,0.956427,89.0,90.0,2627.0,11.369182,11.369182,34.0,...,6.758648,5.128863,4.012051,7.205663,5.160348,7.809184,7.665266,6.8899,7.608002,6.568245


In [8]:
# Separate the label column from the features for each split.
# NOTE(review): every non-target column is kept as a feature, including
# `id_sentence`, which in the recorded describe() output ranges from 1 to 2627
# and looks like a sentence identifier rather than a predictive signal —
# confirm whether GBClassifier excludes it or whether it should be dropped here.
y_train, y_test = data_train['target'], data_test['target']
X_train, X_test = (
    frame.drop(columns='target') for frame in (data_train, data_test)
)

## Training

In [9]:
# Instantiate the project's classifier wrapper (imported from `pairing`) with
# its default configuration.
model = GBClassifier()

In [10]:
# Train on the training features/labels.
model.fit(X_train, y_train)
# Trailing expression: display the estimator held by the wrapper so its
# hyper-parameters are recorded alongside the results.
# NOTE(review): the recorded repr shows random_state=None — consider fixing it
# if GBClassifier exposes the option.
model.model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [11]:
# Predict labels for the held-out test features; `pred` is compared against
# y_test in the evaluation cells below.
pred = model.predict(X_test)

In [12]:
# F1 on the test split, reported three ways: treating class 0 as the positive
# label, treating class 1 as the positive label, and the macro average
# (unweighted mean of the two per-class scores).
f1_class_0 = f1_score(y_test, pred, pos_label=0)
f1_class_1 = f1_score(y_test, pred, pos_label=1)
f1_macro = f1_score(y_test, pred, average='macro')

print(
    "f1_0 : {}".format(f1_class_0),
    "f1_1 : {}".format(f1_class_1),
    "f1_a : {}".format(f1_macro),
    sep="\n",
)

f1_0 : 0.9545703338806786
f1_1 : 0.8904724201636316
f1_a : 0.9225213770221551


In [13]:
# Persist the fitted model under definition.MODEL_PAIRING.
# NOTE(review): this overwrites any existing file of the same name on every
# run — presumably intended for the "final" artefact; confirm.
model_pairing_filename = "pairing_final.pkl"
model_pairing_path = os.path.join(definition.MODEL_PAIRING, model_pairing_filename)

model.save(model_pairing_path)

In [15]:
# Tabulate true vs. predicted labels on the test split (rows: true class,
# columns: predicted class, as shown in the recorded output).
GBClassifier.generate_confusion_matrix_table(y_test, pred)

Unnamed: 0,predicted_0,predicted_1
true_0,4360,160
true_1,255,1687
