In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.insert(0, '..')

import definition
from ner import Reader, Extractor, BiLstmCrfTagger, UnaryBatchGenerator
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import classification_report, f1_score

Using TensorFlow backend.


## Data Loading and Preprocessing

In [2]:
# Fraction of the samples assigned to the training split; the remainder is held out for testing.
train_ratio = 0.75
# Fixed seed so the shuffle (and therefore the split) is identical after Restart & Run All.
random_seed = 42

raw_data = Reader.read_file(definition.DATA_PAIRED_SAMPLE)

# Draw the permutation from a dedicated, seeded generator instead of the unseeded global state.
index = np.random.RandomState(random_seed).permutation(len(raw_data))

# Compute the boundary once so the two index slices are complementary by construction.
n_train = int(train_ratio * len(raw_data))
index_train = index[:n_train]
index_test = index[n_train:]

# Build the array once, with dtype=object: without it, samples of differing length raise on
# NumPy >= 1.24 and mixed element types may be silently coerced to a common dtype.
raw_data_array = np.array(raw_data, dtype=object)
raw_data_train = raw_data_array[index_train].tolist()
raw_data_test = raw_data_array[index_test].tolist()

In [17]:
# Report the size of each split, one per line: training count first, then test count.
print(len(raw_data_train), len(raw_data_test), sep="\n")

2023
675


In [3]:
# Build the feature extractor, pointing it at the embedding file referenced by
# definition.MODEL_EMBEDDING_FASTTEXT.
extractor = Extractor(embedding_filename=definition.MODEL_EMBEDDING_FASTTEXT)
# Extract the training split first.
# NOTE(review): the recorded output shows "Encoding model not found. It will be generated."
# while this cell runs, so the encoder is presumably created during the first call —
# confirm in Extractor before reordering these two lines.
X_train, y_train = extractor.extract_data(raw_data_train)
# Extract the held-out split with the same extractor instance.
X_test, y_test = extractor.extract_data(raw_data_test)

Extracting data: 100%|██████████████████████████████████████████████████████████| 2023/2023 [00:00<00:00, 63255.85it/s]


Encoding model not found. It will be generated.


Extracting data: 100%|████████████████████████████████████████████████████████████| 675/675 [00:00<00:00, 56280.92it/s]


In [4]:
# Write the extractor's encoder model to the path configured in definition.MODEL_ENCODING_FILE.
extractor.save_encoder_model(definition.MODEL_ENCODING_FILE)

In [18]:
# Inspect the shape of the first training sample's feature array
# (presumably tokens x features — TODO confirm the axis meaning in Extractor).
X_train[0].shape

(25, 3137)

In [20]:
# Shape of the first training sample's label matrix (converted to an array if it is a plain list).
np.shape(y_train[0])

(25, 5)

## Training

In [7]:
# Input width is read from the extracted data so the model always matches the extractor output.
n_features = X_train[0].shape[1]
# Layer-size hyperparameters forwarded to BiLstmCrfTagger.
n_lstm_unit = 100
n_distributed_dense = 50
# Derive the tag count from the width of the label matrix instead of hard-coding it, so it
# stays in sync with the encoder the same way n_features does (5 for the recorded run).
n_tags = np.array(y_train[0]).shape[1]

print(n_features, n_lstm_unit, n_distributed_dense, n_tags)

3137 100 50 5


In [8]:
# Instantiate the tagger with the dimensions chosen in the hyperparameter cell.
model = BiLstmCrfTagger(
    n_features=n_features,
    n_lstm_unit=n_lstm_unit,
    n_distributed_dense=n_distributed_dense,
    n_tags=n_tags,
)

In [9]:
# Wrap the training features and labels in the project's batch generator, which is what
# model.fit_generator consumes in the training cell.
# NOTE(review): the class name suggests one sample per batch — confirm in ner.UnaryBatchGenerator.
train_generator = UnaryBatchGenerator(X_train, y_train)

In [10]:
# Train for two epochs from the generator and keep the returned history object for inspection.
# With verbose=2 the recorded run logs one summary line per epoch (about 160 s each).
history = model.fit_generator(train_generator, epochs=2, verbose=2)

Epoch 1/2
 - 163s - loss: 0.4658 - crf_accuracy: 0.8180
Epoch 2/2
 - 161s - loss: 0.1274 - crf_accuracy: 0.9039


In [11]:
# Tabulate the per-epoch metrics stored in history.history
# (loss and crf_accuracy in the recorded run), one row per epoch.
pd.DataFrame(history.history)

Unnamed: 0,loss,crf_accuracy
0,0.465845,0.81802
1,0.127354,0.903864


In [12]:
# Save the trained tagger to the path configured in definition.MODEL_NER_KERAS.
model.save(definition.MODEL_NER_KERAS)

In [13]:
# Predict one test sample at a time: each sample is wrapped in a batch of size one and the
# single result is unwrapped again, with tqdm reporting progress over the test set.
pred = [
    model.predict(np.array([sample]))[0]
    for sample in tqdm(X_test)
]

100%|████████████████████████████████████████████████████████████████████████████████| 675/675 [00:18<00:00, 36.73it/s]


In [14]:
# Decode predictions first, then the reference labels, from matrices back to tag labels,
# turning each result into plain nested Python lists.
pred_label, y_test_label = (
    extractor.convert_y_matrix_to_label(matrices).tolist()
    for matrices in (pred, y_test)
)

In [15]:
# Collapse the per-sentence label lists into one flat token-level list each, keeping sentence
# order and token order so predictions and references stay aligned position by position.
pred_label_flatten = [
    label
    for sequence in pred_label
    for label in sequence
]
y_test_label_flatten = [
    label
    for sequence in y_test_label
    for label in sequence
]

In [16]:
# Print the per-tag precision / recall / F1 report over all test tokens
# (reference labels are passed first, predictions second).
print(classification_report(y_test_label_flatten, pred_label_flatten))

              precision    recall  f1-score   support

    B-ASPECT       0.90      0.78      0.84      1283
 B-SENTIMENT       0.89      0.88      0.89      1708
    I-ASPECT       0.90      0.58      0.70       411
 I-SENTIMENT       0.81      0.79      0.80       810
           O       0.90      0.95      0.92      6515

   micro avg       0.89      0.89      0.89     10727
   macro avg       0.88      0.80      0.83     10727
weighted avg       0.89      0.89      0.89     10727

