In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
from pairing import Reader, Extractor, GBClassifier, FilteredGBClassifier
import definition
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, confusion_matrix

## Data Loading and Preprocessing

In [2]:
train_ratio = 0.75

raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)
index = np.arange(len(raw_data))
np.random.shuffle(index)
index_train = index[:int(train_ratio * len(raw_data))]
index_test = index[int(train_ratio * len(raw_data)):]
raw_data_train = np.array(raw_data)[index_train].tolist()
raw_data_test = np.array(raw_data)[index_test].tolist()

In [3]:
print(len(raw_data))
print(len(raw_data_train))
print(len(raw_data_test))

3504
2628
876


In [4]:
embedding_filename = "fasttext_25.bin"
word_count_filename = "word_count_60.pkl"

extractor = Extractor(
    embedding_filename=os.path.join(definition.MODEL_UTILITY, embedding_filename), 
    word_count_filename=os.path.join(definition.MODEL_UTILITY, word_count_filename))
data_train = extractor.extract_data(raw_data_train)
data_test = extractor.extract_data(raw_data_test)

Extracting data: 100%|█████████████████████████████████████████████████████████████| 2628/2628 [00:29<00:00, 88.44it/s]
Extracting data: 100%|███████████████████████████████████████████████████████████████| 876/876 [00:09<00:00, 92.33it/s]


In [5]:
print('TOTAL', '[+]', '[-]', sep='\t')
print(len(data_train), len(data_train[data_train['target']==1]), len(data_train[data_train['target']==0]), sep='\t')
print(len(data_test), len(data_test[data_test['target']==1]), len(data_test[data_test['target']==0]), sep='\t')

TOTAL	[+]	[-]
18752	5781	12971
6137	1954	4183


In [6]:
data_train.head()

Unnamed: 0,_id_sentence,_n_aspect,_n_sentiment,cos_aspect_sentence,cos_aspect_sentiment,cos_aspect_sentiment_validity,cos_sentiment_sentence,dist_endpoint,dist_start,idf_aspect,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
0,0,3,4,0.522249,0.382649,1,0.379685,1,1,4.162805,...,1.303575,-1.536487,-0.60665,0.447448,-1.536233,-0.939863,2.483057,0.01669,2.806749,1.18629
1,0,3,4,0.522249,0.310525,1,0.522566,3,3,4.162805,...,0.854022,1.550181,-1.105873,-3.82473,-1.457078,-2.518814,-0.274611,2.614733,0.836454,-1.148345
2,0,3,4,0.522249,0.151783,1,0.297014,6,6,4.162805,...,1.886635,0.667913,-3.039409,2.095493,-0.684248,0.244786,0.716682,2.76125,4.262498,-0.904011
3,0,3,4,0.522249,0.454805,1,0.66446,13,13,4.162805,...,-1.289075,-2.976948,1.322789,-2.2381,0.394939,-4.074512,1.027948,3.130394,1.149124,0.812326
4,0,3,4,0.594557,0.215558,1,0.379685,4,4,5.716693,...,1.303575,-1.536487,-0.60665,0.447448,-1.536233,-0.939863,2.483057,0.01669,2.806749,1.18629


In [7]:
data_train.describe()

Unnamed: 0,_id_sentence,_n_aspect,_n_sentiment,cos_aspect_sentence,cos_aspect_sentiment,cos_aspect_sentiment_validity,cos_sentiment_sentence,dist_endpoint,dist_start,idf_aspect,...,v_sentiment_22,v_sentiment_23,v_sentiment_24,v_sentiment_3,v_sentiment_4,v_sentiment_5,v_sentiment_6,v_sentiment_7,v_sentiment_8,v_sentiment_9
count,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,...,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0,18752.0
mean,1324.921875,4.300768,5.064953,0.553917,0.22419,1.0,0.519554,8.816553,9.169689,4.057487,...,0.19249,-0.189377,-1.141774,-0.259911,-1.266133,-1.77243,0.28708,1.354594,0.789865,-0.656086
std,756.352515,2.485043,2.66417,0.198392,0.219089,0.0,0.195766,10.509091,10.537256,1.919359,...,2.248921,1.990986,2.22378,2.428824,1.82458,1.789753,1.934739,2.222122,2.070918,1.304187
min,0.0,1.0,1.0,-0.406663,-0.61522,1.0,-0.61415,1.0,1.0,1.388456,...,-6.830917,-7.578067,-13.775706,-7.413177,-7.845984,-6.430819,-7.311463,-11.645122,-8.311359,-7.254439
25%,700.0,2.0,3.0,0.44282,0.072584,1.0,0.42617,2.0,2.0,2.804533,...,-1.360366,-1.49734,-1.95206,-2.2021,-2.192458,-2.522924,-0.893943,-0.00435,-0.218355,-1.364622
50%,1359.0,4.0,5.0,0.599667,0.232521,1.0,0.551699,5.0,5.0,3.706714,...,0.649657,0.236934,-1.105873,-0.560764,-1.424769,-1.851305,0.339579,1.870267,0.95008,-0.864187
75%,1975.25,6.0,6.0,0.698616,0.383941,1.0,0.6532,12.0,12.0,5.172738,...,1.619385,1.329243,-0.016912,0.783933,0.020546,-0.563195,1.497273,3.032854,1.959383,0.380631
max,2626.0,12.0,14.0,0.924766,0.859089,1.0,0.956427,99.0,99.0,11.369182,...,6.758648,5.128863,4.012051,7.205663,11.117597,5.805314,7.665266,6.8899,7.608002,6.568245


In [8]:
X_train = data_train.drop('target', axis=1)
X_test = data_test.drop('target', axis=1)
y_train = data_train['target']
y_test = data_test['target']

## Training

TODO : drop dummy features

In [15]:
# model = GBClassifier()
# X_train_final = X_train.drop(labels=['_id_sentence', '_n_aspect', '_n_sentiment'], axis=1)
# X_test_final = X_test.drop(labels=['_id_sentence', '_n_aspect', '_n_sentiment'], axis=1)

model = FilteredGBClassifier()
X_train_final = X_train.drop(labels=['_id_sentence', '_n_aspect'], axis=1)
X_test_final = X_test.drop(labels=['_id_sentence', '_n_aspect'], axis=1)

In [16]:
model.fit(X_train_final, y_train)
model.model

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [17]:
pred = model.predict(X_test_final)

In [18]:
print("f1_0 : {}".format(f1_score(y_test, pred, pos_label=0)))
print("f1_1 : {}".format(f1_score(y_test, pred, pos_label=1)))
print("f1_a : {}".format(f1_score(y_test, pred, average='macro')))

f1_0 : 0.9559099437148217
f1_1 : 0.8996262680192205
f1_a : 0.9277681058670211


In [19]:
model_pairing_filename = "pairing_final.pkl"

model.save(os.path.join(definition.MODEL_PAIRING, model_pairing_filename))

In [20]:
GBClassifier.generate_confusion_matrix_table(y_test, pred)

Unnamed: 0,predicted_0,predicted_1
true_0,4076,107
true_1,269,1685
