# Imports

In [1]:
import numpy as np
import pickle
import data.load
from metrics.accuracy import conlleval
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN, GRU, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers.wrappers import TimeDistributed
from keras.layers import Conv1D, Conv2D, MaxPooling1D, Flatten
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import progressbar
import gensim

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
def w2v(w):
    if w in word2vector:
        return word2vector[w]
    else:
        return np.zeros([300,])

# Load data

#### This one loads the embeding model, takes a while

In [3]:
word2vector = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
word2vec_keras_layer = word2vector.get_keras_embedding(train_embeddings=False)

In [4]:
train_set, valid_set, dicts = data.load.atisfull()
w2idx, ne2idx, labels2idx = dicts['words2idx'], dicts['tables2idx'], dicts['labels2idx']

idx2w  = {w2idx[k]:k for k in w2idx}
idx2ne = {ne2idx[k]:k for k in ne2idx}
idx2la = {labels2idx[k]:k for k in labels2idx}

train_x, train_ne, train_label = train_set

words_train = [ list(map(lambda x: idx2w[x], w)) for w in train_x]

with open('x_train.pkl', 'rb') as f:
    x_train = np.array(pickle.load(f))[0]
    
# x_train = np.array([])
# bar = progressbar.ProgressBar(max_value=len(words_train))
# for sample in bar(words_train):
#   sentence = np.array([])
#   for word in sample:
#       sentence = np.append(sentence, w2v(word))
#   x_train = np.append(x_train, sentence)

# x_train = [np.array(list(map(lambda x: w2v(x), w))) for w in words_train]
groundtruth_train = [ list(map(lambda x: idx2la[x], y)) for y in train_label]

val_x, val_ne, val_label = valid_set

words_val = [ list(map(lambda x: idx2w[x], w)) for w in val_x]
groundtruth_val = [ list(map(lambda x: idx2la[x], y)) for y in val_label]

# Make the model

In [11]:
model = Sequential()
model.add(word2vec_keras_layer)
model.add(Conv2D(64,5,padding='same', activation='relu'))
model.add(MaxPooling1D(64*4)) # maybe this works?
model.add(Dropout(0.25))
model.add(GRU(300,return_sequences=True))
model.add(TimeDistributed(Dense(127, activation='softmax')))
model.compile('rmsprop', 'categorical_crossentropy')

ValueError: Input 0 is incompatible with layer conv2d_1: expected ndim=4, found ndim=3

In [7]:
x_train.shape

(4978,)

In [8]:
sent = x_train[0] #x_train is the train sentences encoded using word2vec

pred = model.predict_on_batch(sent)
print(pred.shape)
pred = np.array(list(map(lambda x: x[0] ,pred)))
print(pred.shape)
print(pred[0])

(18, 1, 127)
(18, 127)
[ 0.00787602  0.00787465  0.00787364  0.00787544  0.00787482  0.00787726
  0.0078731   0.00787254  0.00787361  0.0078729   0.00787324  0.0078715
  0.0078724   0.00787207  0.00787165  0.00787573  0.00787067  0.00787468
  0.00787634  0.0078729   0.00787395  0.00787238  0.00787589  0.00787564
  0.00787241  0.00787218  0.00787273  0.00787771  0.00787318  0.00787421
  0.00787464  0.00787514  0.00787446  0.00787196  0.00787427  0.00787152
  0.00787409  0.00787542  0.00787414  0.00787166  0.00787034  0.00787678
  0.00787363  0.0078737   0.00787488  0.00787728  0.00787272  0.00787308
  0.00787598  0.00787478  0.00787168  0.00787351  0.00787662  0.00787488
  0.00787298  0.00787655  0.00787842  0.00787261  0.00787485  0.00787336
  0.00787514  0.0078745   0.00787923  0.00787272  0.00786973  0.00787188
  0.00787394  0.00787456  0.00787625  0.00787171  0.00787308  0.00787261
  0.00787002  0.00787259  0.00787556  0.00787447  0.00787419  0.00787394
  0.00787419  0.00787745  0.0

In [9]:
### Training
n_epochs = 1
n_classes = 127
train_f_scores = []
val_f_scores = []
best_val_f1 = 0

In [10]:
print("Training =>")
train_pred_label = []
avgLoss = 0

bar = progressbar.ProgressBar(max_value=len(x_train))
for n_batch, sent in bar(enumerate(x_train)):
    label = train_label[n_batch]
    label = np.eye(n_classes)[label][np.newaxis,:][0]
    print(label.shape)
    print(sent.shape)
    if sent.shape[1] > 1: #some bug in keras
        loss = model.train_on_batch(sent, label)
        avgLoss += loss

    pred = model.predict_on_batch(sent)
    pred = np.array(list(map(lambda x: x[0] ,pred)))
    pred = np.argmax(pred,-1)
    train_pred_label.append(pred)
avgLoss = avgLoss/n_batch

N/A% (0 of 4978) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

Training =>
(18, 127)
(18, 300)


ValueError: Error when checking target: expected time_distributed_1 to have 3 dimensions, but got array with shape (18, 127)

### Next steps from the source file

In [124]:
for i in range(n_epochs):
    print("Epoch {}".format(i))

    print("Training =>")
    train_pred_label = []
    avgLoss = 0

    bar = progressbar.ProgressBar(max_value=len(x_train))
    for n_batch, sent in bar(enumerate(x_train)):
        label = train_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis,:]
        print(label.shape)
        if sent.shape[1] > 1: #some bug in keras
            loss = model.train_on_batch(sent, label)
            avgLoss += loss

        pred = model.predict_on_batch(sent)
        pred = np.array(list(map(lambda x: x[0] ,pred)))
        pred = np.argmax(pred,-1)
        train_pred_label.append(pred)
    avgLoss = avgLoss/n_batch

    predword_train = [ list(map(lambda x: idx2la[x], y)) for y in train_pred_label]
    #                    guess(labels)     testY(labels)   words input[]
    con_dict = conlleval(predword_train, groundtruth_train, words_train, 'r.txt')
    train_f_scores.append(con_dict['f1'])
    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))


    print("Validating =>")

    val_pred_label = []
    avgLoss = 0

    bar = progressbar.ProgressBar(max_value=len(val_x))


    for n_batch, sent in bar(enumerate(val_x)):
        label = val_label[n_batch]
        label = np.eye(n_classes)[label][np.newaxis,:]
        sent = sent[np.newaxis,:]

        if sent.shape[1] > 1: #some bug in keras
            loss = model.test_on_batch(sent, label)
            avgLoss += loss

        print('sent')
        for w in sent[0]:
            # print('w')
            # print(w)
            # print('w')
            print(idx2w[w], end=' ')
        print('sent')
        pred = model.predict_on_batch(sent)
        pred = np.argmax(pred,-1)[0]
        print('pred')
        for w in pred:
            print(idx2la[w], end=' ')
        print('pred')
        val_pred_label.append(pred)

    avgLoss = avgLoss/n_batch

    predword_val = [ list(map(lambda x: idx2la[x], y)) for y in val_pred_label]
    con_dict = conlleval(predword_val, groundtruth_val, words_val, 'r.txt')
    val_f_scores.append(con_dict['f1'])

    print('Loss = {}, Precision = {}, Recall = {}, F1 = {}'.format(avgLoss, con_dict['r'], con_dict['p'], con_dict['f1']))

    if con_dict['f1'] > best_val_f1:
    	best_val_f1 = con_dict['f1']
    	open('model_architecture.json','w').write(model.to_json())
    	model.save('best_model.h5',overwrite=True)
    	print("Best validation F1 score = {}".format(best_val_f1))
    print()


N/A% (0 of 4978) |                       | Elapsed Time: 0:00:00 ETA:  --:--:--

Epoch 0
Training =>
(1, 18, 127)


ValueError: Input arrays should have the same number of samples as target arrays. Found 18 input samples and 1 target samples.