In [1]:
import numpy as np
import pickle

import data.load
from metrics.accuracy import conlleval

from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Convolution1D, MaxPooling1D, Flatten
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import progressbar

import gensim
word2vector = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

def w2v(w):
    if w in word2vector:
        return word2vector[w]
    else:
        return np.zeros([300,])

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# https://deeplearning4j.org/word2vec.html
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
# http://ahogrammer.com/2017/01/20/the-list-of-pretrained-word-embeddings/

### Load Data
train_set, valid_set, dicts = data.load.atisfull()
w2idx, ne2idx, labels2idx = dicts['words2idx'], dicts['tables2idx'], dicts['labels2idx']

idx2w  = {w2idx[k]:k for k in w2idx}
idx2ne = {ne2idx[k]:k for k in ne2idx}
idx2la = {labels2idx[k]:k for k in labels2idx}

train_x, train_ne, train_label = train_set

words_train = [ list(map(lambda x: idx2w[x], w)) for w in train_x]

x_train = np.array([])
bar = progressbar.ProgressBar(max_value=len(words_train))
for sample in bar(words_train):
    sentence = np.array([])
    for word in sample:
        sentence = np.append(sentence, w2v(word))
    x_train = np.append(x_train, sentence)

x_train = [np.array(list(map(lambda x: w2v(x), w))) for w in words_train]
groundtruth_train = [ list(map(lambda x: idx2la[x], y)) for y in train_label]

val_x, val_ne, val_label = valid_set

words_val = [ list(map(lambda x: idx2w[x], w)) for w in val_x]
groundtruth_val = [ list(map(lambda x: idx2la[x], y)) for y in val_label]

100% (4978 of 4978) |#####################| Elapsed Time: 0:04:12 Time: 0:04:12


In [None]:
# Define model
import keras

model = Sequential()
model.add(word2vector.get_keras_embedding(False))
model.add(Convolution1D(64,5,padding='same', activation='relu'))
model.add(Dropout(0.25))
model.add(GRU(300,return_sequences=True))
model.add(TimeDistributed(Dense(127, activation='softmax')))
model.add(MaxPooling1D(64*4)) # maybe this works?
model.compile('rmsprop', 'categorical_crossentropy')


In [None]:
sent = x_train[0] #x_train is the train sentences encoded using word2vec

pred = model.predict_on_batch(sent)
#pred = np.array(list(map(lambda x: x[0] ,pred)))

In [108]:
pred.shape

(18, 127)

In [120]:
### Training
n_epochs = 1
n_classes = 127
train_f_scores = []
val_f_scores = []
best_val_f1 = 0

In [116]:
np.argmax(pred,-1)

array([111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111, 111,
       111, 111, 111, 111, 111])

In [117]:
pred.shape

(18, 127)

### Next steps from the source file

In [124]:
for i in range(n_epochs):
    print("Epoch {}".format(i))

    print("Training =>")
    train_pred_label = []
    avgLoss = 0

    bar = progressbar.ProgressBar(max_value=len(x_train))
    for n_batch, sent in bar(enumerate(x_train)):
        label = train_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis,:]
        print(label.shape)
        if sent.shape[1] > 1: #some bug in keras
            loss = model.train_on_batch(sent, label)
            avgLoss += loss

        pred = model.predict_on_batch(sent)
        pred = np.array(list(map(lambda x: x[0] ,pred)))
        pred = np.argmax(pred,-1)
        train_pred_label.append(pred)
    avgLoss = avgLoss/n_batch

    predword_train = [ list(map(lambda x: idx2la[x], y)) for y in train_pred_label]
    #                    guess(labels)     testY(labels)   words input[]
    con_dict = conlleval(predword_train, groundtruth_train, words_train, 'r.txt')
    train_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))


    print("Validating =>")

    val_pred_label = []
    avgLoss = 0

    bar = progressbar.ProgressBar(max_value=len(val_x))


    for n_batch, sent in bar(enumerate(val_x)):
        label = val_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis,:]
        sent = sent[np.newaxis,:]

        if sent.shape[1] > 1: #some bug in keras
            loss = model.test_on_batch(sent, label)
            avgLoss += loss

        print('sent')
        for w in sent[0]:
            # print('w')
            # print(w)
            # print('w')
            print(idx2w[w], end=' ')
        print('sent')
        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred,-1)[0]
        print('pred')
        for w in pred:
            print(idx2la[w], end=' ')
        print('pred')
        val_pred_label.append(pred)

    avgLoss = avgLoss/n_batch

    predword_val = [ list(map(lambda x: idx2la[x], y)) for y in val_pred_label]
    con_dict = conlleval(predword_val, groundtruth_val, words_val, 'r.txt')
    val_f_scores.append(con_dict['f1'])

    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

    if con_dict['f1'] > best_val_f1:
    	best_val_f1 = con_dict['f1']
    	open('model_architecture.json','w').write(model.to_json())
    	model.save('best_model.h5',overwrite=True)
    	print("Best validation F1 score = {}".format(best_val_f1))
    print()


N/A% (0 of 4978) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

Epoch 0
Training =>
(1, 18, 127)


ValueError: Input arrays should have the same number of samples as target arrays. Found 18 input samples and 1 target samples.