In [1]:
import random
import os
import matplotlib.pyplot as plt
import argparse
import keras
from utils import *
from models.lstm_classifier import LSTMClassifier
from models.gru_classifier import GRUClassifier
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [2]:
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): this runs after `import keras`; presumably the backend has not
# yet initialised CUDA at this point -- confirm the setting takes effect.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

## Train the model

In [3]:
# Read the training and validation splits in one statement; the right-hand
# side is evaluated left to right, so the train file is still loaded first.
train, val = load_data('data/utterances.train'), load_data('data/utterances.valid')

In [4]:
# Fold the validation rows into the training frame ...
train = pd.concat([train, val])
# ... then shuffle every row (`frac=1` draws the whole frame in random order).
# NOTE(review): no `random_state` is passed, so the order differs between runs,
# and re-running this cell appends `val` to `train` a second time.
train = train.sample(frac=1)
# Share of the merged frame that originally came from `val`.
val_split = len(val) / len(train)
val_split

0.09237789951132092

In [5]:
# Load the test split; `train=False` is forwarded to `load_data`
# (presumably because the test file carries no labels -- TODO confirm in utils).
test = load_data('data/utterances.test', train=False)

In [6]:
# Distinct dialog-act labels, in order of first appearance in the shuffled frame.
classes = pd.unique(train['dialog_act'])
num_classes = len(classes)
# Length of the longest `utterance_t` entry, measured with len() per row.
max_len = train['utterance_t'].map(len).max()

In [7]:
# Encode the training frame with `process_data`, which returns the model
# inputs/targets plus the vocabulary and the two word<->index lookup tables.
# NOTE(review): presumably sequences are padded/truncated to `max_len` -- confirm in utils.
(X_train, y_train), (vocab, word_to_idx, idx_to_word) = process_data(train, max_len)

In [8]:
# Extend the vocabulary and both lookup tables with words that occur only in
# the test set.
#
# Each new word must be registered exactly once: the next index is taken from
# `len(word_to_idx)`, so handling a repeated word a second time would overwrite
# its index without growing the dict, and the following new word would then be
# given that same index. `dict.fromkeys` removes duplicates while keeping the
# first-seen order.
# NOTE(review): assumes the existing indices are exactly 0..len(word_to_idx)-1
# -- confirm against process_data in utils.
test_vocab = list(dict.fromkeys(
    word for row in test['utterance_t'] for word in row if word not in vocab
))
for word in test_vocab:
    idx = len(word_to_idx)
    word_to_idx[word] = idx
    idx_to_word[idx] = word
    vocab.add(word)

In [9]:
# Read the pre-trained GloVe vectors into a {token: float32 vector} dict.
# NOTE(review): the path is an absolute, machine-specific location.
weights = {}
with open('/projekte/slu/share/emb/glove.twitter.27B.50d.txt', 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of `readlines()`, so the raw
    # text of the whole file is never held in memory next to the dict.
    for line in f:
        line = line.split()
        if not line:
            continue  # skip blank lines instead of failing on line[0]
        weights[line[0]] = np.array(line[1:], dtype='float32')
# Corrects an error in the file: drop the entry stored under this numeric key.
# `pop` with a default does not raise if the entry is absent.
weights.pop('0.45973', None)

In [10]:
# embds_path = '/projekte/slu/share/GoogleNews-vectors-negative300.bin' 
# weights = load_bin_vec(embds_path, word_to_idx)

In [11]:
# Embedding dimensionality, read from the first stored vector. This is
# deterministic and avoids copying every key into a list just to sample one.
# NOTE(review): assumes every vector in `weights` has the same length -- confirm.
num_dims = next(iter(weights.values())).shape[0]
# Helpers from utils: `add_unknown_words` is given the weights and the
# word->index table, `get_W` returns the matrix `W` and its own `word_idx` map.
add_unknown_words(weights, word_to_idx, k=num_dims, min_df=0)
W, word_idx = get_W(weights, k=num_dims)

In [12]:
# Build the classifier from the label count, the sequence length and the
# embedding matrix W. The commented-out line is the LSTM alternative with the
# same arguments.
model = GRUClassifier(num_classes, max_len, W)
#model = LSTMClassifier(num_classes, max_len, W)

In [15]:
# Configure training: categorical cross-entropy loss, Adam optimizer,
# accuracy reported as the metric.
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])
# `summary()` prints the layer table itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 106, 50)           60195350  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               44160     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
__________

## Test the model

In [None]:
# Load a previously saved model from disk.
# NOTE(review): this rebinds `model`, replacing the classifier built and
# compiled in the cells above, and the file name says "lstm" while those cells
# instantiate the GRU variant -- confirm this is intended.
model = keras.models.load_model('lstm_classifier.h5')

In [None]:
%%time
# Train for 3 epochs with batch size 128 and keep the returned history object.
# The validation fraction is twice the share of the original validation rows
# (the recorded output shows 40000 of 216502 samples held out).
history = model.fit(X_train, y_train, validation_split=val_split * 2,
                    epochs=3, batch_size=128)

Train on 176502 samples, validate on 40000 samples
Epoch 1/3

In [None]:
# Write the current model to an HDF5 file.
# NOTE(review): the file name mentions "lstm" and "google", while the cells
# above build a GRU classifier on GloVe vectors, and it differs from the file
# that is loaded ('lstm_classifier.h5') -- confirm the intended name.
model.save('lstm_model_google.h5')

In [None]:
# Convert every test utterance with `process_seq` (given `word_idx`), then pad
# the resulting sequences to `max_len`.
# NOTE(review): the training inputs come from `process_data`, which returned
# `word_to_idx`, whereas this cell uses `word_idx` from `get_W` -- confirm both
# mappings assign the same indices. `sequence` is not imported in the visible
# cells; presumably it arrives via `from utils import *`.
X_test = sequence.pad_sequences(test['utterance_t'].apply(process_seq, args=[word_idx]),
                                maxlen=max_len)

In [None]:
# Predict one class index per test sequence and display the resulting array.
pred = model.predict_classes(X_test)
pred

In [None]:
# Display the label array; the cells below index it with predicted class indices.
classes

In [None]:
# Distribution of predictions: every distinct predicted class index together
# with its share of all test predictions.
counts = np.unique(pred, return_counts=True)
for class_idx, n_pred in zip(*counts):
    print(classes[class_idx], n_pred / len(pred))

In [None]:
# Relative frequency of each dialog act in the training frame, most frequent
# first -- for comparison with the predicted distribution printed above.
(train.groupby('dialog_act').size() / len(train)).sort_values(ascending=False)

In [None]:
# Print the first 20 test utterances next to their predicted labels.
# NOTE(review): indexing `classes` with a predicted index assumes the label
# encoding used for training follows the order of `classes` -- confirm in utils.
for label_idx, tokens in zip(pred[:20], test['utterance_t'].head(20)):
    print(classes[label_idx], tokens)