In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
train_df = pd.read_pickle('./train_seq_df.pkl')
dev_df = pd.read_pickle('./dev_seq_df.pkl')
test_df = pd.read_pickle('./test_seq_df.pkl')

In [3]:
train_df.head()

Unnamed: 0,product,label
0,apple watch,brand category
1,ipad,category
2,apple watch series 3,brand category modelname modelname
3,apple watch series 2,brand category modelname modelname
4,apple homepod,brand category


In [4]:
train_sentences = []
words = set()
for i, row in train_df.iterrows():
    sent = list(zip(row['product'].split(), row['label'].split()))
    train_sentences += [sent]
    words.update(row['product'].split())
    
dev_sentences = []
for i, row in dev_df.iterrows():
    sent = list(zip(row['product'].split(), row['label'].split()))
    dev_sentences += [sent]
    words.update(row['product'].split())
    
test_sentences = []
for i, row in test_df.iterrows():
    sent = list(zip(row['product'].split(), row['label'].split()))
    test_sentences += [sent]
    words.update(row['product'].split())
    
words = list(words)
tags = ['other', 'category', 'modelname', 'brand']

n_tags = len(tags)
n_words = len(words) # vocabulary size
n_tags, n_words

(4, 837)

In [5]:
train_sentences[:5]

[[('apple', 'brand'), ('watch', 'category')],
 [('ipad', 'category')],
 [('apple', 'brand'),
  ('watch', 'category'),
  ('series', 'modelname'),
  ('3', 'modelname')],
 [('apple', 'brand'),
  ('watch', 'category'),
  ('series', 'modelname'),
  ('2', 'modelname')],
 [('apple', 'brand'), ('homepod', 'category')]]

In [6]:
MAX_LEN = 7  # Max length of review (in words)

# Vocabulary Key:word -> Value:token_index
# The first 2 entries are reserved for PAD and UNK
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["UNK"] = 1 # Unknown words
word2idx["PAD"] = 0 # Padding

# Vocabulary Key:token_index -> Value:word
idx2word = {i: w for w, i in word2idx.items()}

# Vocabulary Key:Label/Tag -> Value:tag_index
# The first entry is reserved for PAD
tag2idx = {t: i+1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}

In [7]:
# Convert each sentence from list of Token to list of word_index
X_train = [[word2idx[w[0]] for w in s] for s in train_sentences]
X_train = pad_sequences(maxlen=MAX_LEN, sequences=X_train, padding="post", value=word2idx["PAD"])
y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
y_train = pad_sequences(maxlen=MAX_LEN, sequences=y_train, padding="post", value=tag2idx["PAD"])

X_dev = [[word2idx[w[0]] for w in s] for s in dev_sentences]
X_dev = pad_sequences(maxlen=MAX_LEN, sequences=X_dev, padding="post", value=word2idx["PAD"])
y_dev = [[tag2idx[w[1]] for w in s] for s in dev_sentences]
y_dev = pad_sequences(maxlen=MAX_LEN, sequences=y_dev, padding="post", value=tag2idx["PAD"])

X_test = [[word2idx[w[0]] for w in s] for s in test_sentences]
X_test = pad_sequences(maxlen=MAX_LEN, sequences=X_test, padding="post", value=word2idx["PAD"])
y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]
y_test = pad_sequences(maxlen=MAX_LEN, sequences=y_test, padding="post", value=tag2idx["PAD"])

# One-Hot encode
y_train = [to_categorical(i, num_classes=n_tags+1) for i in y_train]  # n_tags+1(PAD)
y_dev = [to_categorical(i, num_classes=n_tags+1) for i in y_dev]  # n_tags+1(PAD)
y_test = [to_categorical(i, num_classes=n_tags+1) for i in y_test]  # n_tags+1(PAD)

In [8]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from keras_contrib.layers.crf import CRF

BATCH_SIZE = 16  # Number of examples used in each iteration
EPOCHS = 15  # Number of passes through entire dataset
EMBEDDING = 35  # Dimension of word embedding vector
DROPOUT = 0.2

# Model definition
input = Input(shape=(MAX_LEN,))
model = Embedding(input_dim=n_words+2, output_dim=EMBEDDING, # n_words + 2 (PAD & UNK)
                  input_length=MAX_LEN, mask_zero=True)(input)  # default: 20-dim embedding
model = Bidirectional(LSTM(units=50, return_sequences=True,
                           recurrent_dropout=DROPOUT))(model)  # variational biLSTM
model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
crf = CRF(n_tags+1)  # CRF layer, n_tags+1(PAD)
out = crf(model)  # output

model = Model(input, out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
# model.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where





In [11]:
history = model.fit(X_train, np.array(y_train), batch_size=BATCH_SIZE, epochs=20,
                    validation_data=(X_dev, np.array(y_dev)), verbose=2)
# Eval
pred_cat = model.predict(X_test)
pred = np.argmax(pred_cat, axis=-1)
y_test_true = np.argmax(y_test, -1)

Train on 554 samples, validate on 139 samples
Epoch 1/20
 - 0s - loss: 4.1777 - crf_viterbi_accuracy: 0.9950 - val_loss: 11.7647 - val_crf_viterbi_accuracy: 0.4646
Epoch 2/20
 - 0s - loss: 4.1749 - crf_viterbi_accuracy: 0.9949 - val_loss: 12.3182 - val_crf_viterbi_accuracy: 0.4587
Epoch 3/20
 - 0s - loss: 4.1730 - crf_viterbi_accuracy: 0.9964 - val_loss: 12.6766 - val_crf_viterbi_accuracy: 0.4536
Epoch 4/20
 - 0s - loss: 4.1720 - crf_viterbi_accuracy: 0.9963 - val_loss: 12.6073 - val_crf_viterbi_accuracy: 0.4564
Epoch 5/20
 - 0s - loss: 4.1692 - crf_viterbi_accuracy: 0.9992 - val_loss: 13.1513 - val_crf_viterbi_accuracy: 0.4538
Epoch 6/20
 - 0s - loss: 4.1685 - crf_viterbi_accuracy: 0.9993 - val_loss: 13.5974 - val_crf_viterbi_accuracy: 0.4538
Epoch 7/20
 - 0s - loss: 4.1670 - crf_viterbi_accuracy: 1.0000 - val_loss: 13.7103 - val_crf_viterbi_accuracy: 0.4650
Epoch 8/20
 - 0s - loss: 4.1666 - crf_viterbi_accuracy: 1.0000 - val_loss: 13.8139 - val_crf_viterbi_accuracy: 0.4707
Epoch 9/20

In [10]:
from sklearn_crfsuite.metrics import flat_classification_report

pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_test_true_tag = [[idx2tag[i] for i in row] for row in y_test_true] 

report = flat_classification_report(y_pred=pred_tag, y_true=y_test_true_tag)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

         PAD       1.00      1.00      1.00       337
       brand       0.92      0.44      0.59        25
    category       0.39      1.00      0.56        67
   modelname       0.71      0.21      0.32        58
       other       0.00      0.00      0.00        52

    accuracy                           0.79       539
   macro avg       0.60      0.53      0.49       539
weighted avg       0.79      0.79      0.76       539

