In [4]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

import os
from pairing import Reader, Extractor, Classifier
import definition
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, confusion_matrix

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data Loading and Preprocessing

In [5]:
# Fraction of raw records assigned to the training split; the rest is test.
train_ratio = 0.75
# Fixed seed so the shuffle below (and therefore the split, the reported
# metrics and the saved model) is reproducible across kernel restarts.
split_seed = 42

raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)

# Shuffle record positions with a dedicated seeded generator rather than the
# global NumPy RNG, so the result does not depend on hidden kernel state.
index = np.random.RandomState(split_seed).permutation(len(raw_data))

# Compute the cut point once so both slices are guaranteed to agree on it.
n_train = int(train_ratio * len(raw_data))
index_train = index[:n_train]
index_test = index[n_train:]

# Convert to an array a single time, then pick each split by fancy indexing
# and hand plain Python lists to the downstream extractor.
raw_data_array = np.array(raw_data)
raw_data_train = raw_data_array[index_train].tolist()
raw_data_test = raw_data_array[index_test].tolist()

In [57]:
# File names of the embedding and word-count resources used by the extractor.
embedding_filename = "fasttext_25.bin"
word_count_filename = "word_count_60.pkl"

# Both resources are resolved relative to definition.MODEL_UTILITY.
embedding_path = os.path.join(definition.MODEL_UTILITY, embedding_filename)
word_count_path = os.path.join(definition.MODEL_UTILITY, word_count_filename)

extractor = Extractor(
    embedding_filename=embedding_path,
    word_count_filename=word_count_path,
)

# Run the same extractor over the training split first, then the test split.
data_train, data_test = [
    extractor.extract_data(raw_split)
    for raw_split in (raw_data_train, raw_data_test)
]

Extracting data: 100%|████████████████████████████████████████████████████████████| 2025/2025 [00:06<00:00, 289.46it/s]
Extracting data: 100%|██████████████████████████████████████████████████████████████| 675/675 [00:02<00:00, 292.41it/s]


In [58]:
# Preview the first rows of the extracted training table as a quick sanity check.
data_train.head()

Unnamed: 0,dist_endpoint,dist_start,idf_aspect,idf_sentiment,len_aspect_char,len_aspect_word,len_sentiment_char,len_sentiment_word,position_aspect,position_sentiment,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,1,1,4.145886,2.121546,5,1,5,1,1,0,...,2.458015,1.628019,-0.734806,-0.996471,-1.02617,-0.442034,1.933959,3.023867,1.022098,-0.16321
1,4,4,2.797311,1.52089,4,1,6,1,4,0,...,0.854022,1.550181,-1.105873,-3.82473,-1.457078,-2.518814,-0.274611,2.614733,0.836454,-1.148345
2,2,2,2.797311,1.549511,4,1,6,1,4,2,...,0.96125,-0.195079,-1.953468,-2.425579,-2.192458,-1.851305,0.86455,3.782375,1.87804,-0.864187
3,1,1,2.797311,5.455679,4,1,7,1,4,5,...,-0.350989,-0.327322,-0.13373,0.348254,-0.438743,-2.399245,4.210409,0.970092,2.60419,0.79928
4,1,1,2.853791,4.903294,8,1,16,3,0,1,...,-1.451063,-2.099121,0.424242,0.397704,-1.733288,-2.104239,-0.76279,2.260427,1.181861,0.635228


In [59]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training table.
data_train.describe()

Unnamed: 0,dist_endpoint,dist_start,idf_aspect,idf_sentiment,len_aspect_char,len_aspect_word,len_sentiment_char,len_sentiment_word,position_aspect,position_sentiment,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
count,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,...,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0,14154.0
mean,8.350431,8.701922,4.055253,3.67608,7.890914,1.328741,8.21259,1.494772,11.323371,12.343507,...,0.191561,-0.266816,-0.931811,-0.310977,-1.234158,-1.73006,0.202764,1.614409,0.623943,-0.677299
std,9.363235,9.3889,1.912575,1.775126,3.61413,0.572539,4.226567,0.682166,12.662116,12.893933,...,1.980592,1.889491,1.406831,2.069538,1.575546,1.468785,1.686926,1.645082,1.670651,1.072304
min,1.0,1.0,1.388456,1.52089,2.0,1.0,2.0,1.0,0.0,0.0,...,-6.440172,-5.198436,-6.352921,-6.131666,-4.76335,-6.430819,-7.003192,-4.657427,-5.751933,-5.082765
25%,2.0,2.0,2.804533,2.206143,5.0,1.0,5.0,1.0,2.0,3.0,...,-1.28885,-1.577179,-1.418306,-1.541537,-2.243035,-2.562598,-0.764046,0.530461,-0.392449,-1.215277
50%,5.0,5.0,3.706714,3.325519,8.0,1.0,6.0,1.0,7.0,8.0,...,0.53931,-0.167105,-0.794217,-0.351841,-1.352996,-1.851305,-0.160534,1.964699,0.76015,-0.822427
75%,11.0,12.0,5.172738,4.442605,10.0,2.0,11.0,2.0,16.0,18.0,...,1.365762,1.017796,-0.038788,0.936765,-0.239676,-0.895733,1.171662,3.023867,1.767821,-0.088497
max,87.0,88.0,11.369182,11.369182,34.0,6.0,29.0,6.0,79.0,88.0,...,6.758648,7.202626,4.012051,7.205663,3.164414,8.79932,4.77892,5.664126,7.608002,6.568245


In [60]:
# Separate the label column from the remaining feature columns for each split.
label_column = 'target'
X_train, y_train = data_train.drop(columns=label_column), data_train[label_column]
X_test, y_test = data_test.drop(columns=label_column), data_test[label_column]

## Training

In [61]:
# Instantiate the project's pairing Classifier with no constructor arguments.
model = Classifier()

In [62]:
# Fit on the training split, then display the wrapped estimator (`model.model`)
# so its hyperparameters are recorded in the cell output.
model.fit(X_train, y_train)
model.model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [63]:
# Predict labels for the held-out test features; `pred` is scored against y_test below.
pred = model.predict(X_test)

In [64]:
# Each entry pairs an output template with the f1_score keyword arguments that
# produce it: per-class F1 for labels 0 and 1, then the macro average.
f1_reports = (
    ("f1_0 : {}", {"pos_label": 0}),
    ("f1_1 : {}", {"pos_label": 1}),
    ("f1_a : {}", {"average": 'macro'}),
)
for report_template, score_kwargs in f1_reports:
    print(report_template.format(f1_score(y_test, pred, **score_kwargs)))

f1_0 : 0.9572307692307692
f1_1 : 0.9041379310344827
f1_a : 0.930684350132626


In [65]:
# Persist the fitted classifier under definition.MODEL_PAIRING.
model_pairing_filename = "pairing_final.pkl"
model_pairing_path = os.path.join(definition.MODEL_PAIRING, model_pairing_filename)

model.save(model_pairing_path)

In [66]:
# Tabulate true vs. predicted class counts for the test split.
Classifier.generate_confusion_matrix_table(y_test, pred)

Unnamed: 0,predicted_0,predicted_1
true_0,3111,97
true_1,181,1311
