In [57]:
import os
import pickle

import numpy as np
import pandas as pd
from keras import utils as np_utils
from keras.engine.topology import Layer
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Embedding, Merge, LSTM, GRU, Reshape
from keras.layers import Input, Dense, Dropout, TimeDistributed
from keras.layers import concatenate
from keras.layers.core import Flatten
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [60]:
# Load the pickled dataset. The context manager closes the file even when
# unpickling raises, which the manual open()/close() pair did not guarantee.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('data/reduced_data.pkl', 'rb') as f:
    data = pickle.load(f)

In [61]:
# Wrap the unpickled records in a frame with one column for the inputs
# ('data') and one for the targets ('labels').
df = pd.DataFrame(data=data, columns=['data', 'labels'])

In [62]:
# Pull both columns back out as plain Python lists; from here on `data`
# holds the inputs and `labels` the targets, in matching row order.
data, labels = (df[column].tolist() for column in ('data', 'labels'))

# Tokenize the data (strings)

In [63]:
# Fit the vocabulary on the whole corpus, then encode every entry of `data`
# as a list of integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=data)
sequences = tokenizer.texts_to_sequences(texts=data)

# Encode the target labels as integers

In [64]:
# Replace each label in `labels` (in place) with an integer id assigned in
# order of first appearance. `dict1` keeps the label -> id mapping and `inc`
# ends up equal to the number of distinct labels.
# NOTE(review): ids start at 1, so id 0 is never assigned -- a later one-hot
# encoding of these ids will presumably carry an unused column 0; confirm
# that is intended.
dict1 = {}
inc = 0
for position, raw_label in enumerate(labels):
    if raw_label not in dict1:
        inc += 1
        dict1[raw_label] = inc
    labels[position] = dict1[raw_label]

In [65]:
# Collect the variable-length index sequences into a NumPy array.
# dtype=object is required: recent NumPy releases raise ValueError when asked
# to build an array from sequences of unequal length without it.
# NOTE(review): `data` is reassigned from `sequences` by the padding step
# right after this cell, so this value is never read -- consider deleting the
# cell.
data = np.array([np.array(xi) for xi in sequences], dtype=object)

In [66]:
# Bring every encoded sequence to a fixed length of 81 indices so the corpus
# becomes one rectangular integer matrix (padding/truncation side follows the
# Keras defaults).
data = pad_sequences(sequences=sequences, maxlen=81)

In [67]:
# One-hot encode the integer label ids, then report both tensor shapes as a
# quick sanity check.
# NOTE(review): this overwrites `labels`, so re-running the cell would encode
# an already one-hot matrix -- run it once per kernel session.
label_ids = np.asarray(labels)
labels = np_utils.to_categorical(label_ids)
data_shape, label_shape = data.shape, labels.shape
print('Shape of data tensor:', data_shape)
print('Shape of label tensor:', label_shape)

Shape of data tensor: (222768, 81)
Shape of label tensor: (222768, 14)


# Split into training and validation sets

In [68]:
VALIDATION_SPLIT = 0.2
# Fixed seed so the train/validation partition is the same on every run; the
# original unseeded shuffle produced a different split each time.
SPLIT_SEED = 42

# Shuffle inputs and labels with one shared permutation so rows stay paired.
# A local RandomState is used so NumPy's global random state is left untouched.
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]

# Number of rows (taken from the end of the shuffled arrays) held out for
# validation.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [69]:
# Split at an explicit index instead of slicing with a negative count.
# With `nb_validation_samples == 0` the original `data[:-0]` was empty and
# `data[-0:]` was the whole array, i.e. train and validation were swapped.
# For any positive count the result is identical to the negative slices.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [70]:
# Rows are later grouped into blocks of `number_of_samples`; drop the
# trailing rows that would not fill a complete block.
number_of_samples = 4
number_of_blocks_train = x_train.shape[0] // number_of_samples
number_of_blocks_val = x_val.shape[0] // number_of_samples

train_rows = number_of_blocks_train * number_of_samples
val_rows = number_of_blocks_val * number_of_samples

x_train_new, y_train_new = x_train[:train_rows], y_train[:train_rows]
x_val_new, y_val_new = x_val[:val_rows], y_val[:val_rows]

In [71]:
# Group consecutive rows into blocks of `number_of_samples`, giving arrays of
# shape (blocks, samples per block, features). The trailing dimension is read
# from the arrays themselves rather than hard-coded (it was 81 for inputs and
# 14 for labels), so changing the padded length or the number of label
# columns does not break this cell.
x_train_reshaped = x_train_new.reshape(
    number_of_blocks_train, number_of_samples, x_train_new.shape[1])
y_train_reshaped = y_train_new.reshape(
    number_of_blocks_train, number_of_samples, y_train_new.shape[1])

x_val_reshaped = x_val_new.reshape(
    number_of_blocks_val, number_of_samples, x_val_new.shape[1])
y_val_reshaped = y_val_new.reshape(
    number_of_blocks_val, number_of_samples, y_val_new.shape[1])

In [34]:
# Read the pre-trained GloVe vectors into a dict mapping each word to a
# float32 vector. Each line of the file is parsed as
# "<word> <coef> <coef> ...", split on whitespace.
GLOVE_DIR = 'glove/'
embeddings_index = {}
# The context manager closes the file even if parsing a line raises.
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [72]:
# Build the initial weight matrix for the embedding layer: row `i` holds the
# pre-trained vector of the word whose tokenizer index is `i`. The matrix has
# one more row than there are words in `word_index`.
embedding_dim = 300
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))

# `total` counts the vocabulary words visited, `null_vectors` those without a
# pre-trained vector.
null_vectors = 0
total = 0
for token, row in word_index.items():
    total += 1
    vector = embeddings_index.get(token)
    if vector is None:
        # Word missing from the embedding index: its row stays all-zeros.
        null_vectors += 1
        continue
    embedding_matrix[row] = vector

In [73]:
# Width of the recurrent layer used further down.
hidden_dim = 128
word_index = tokenizer.word_index

# Axis 1 of the reshaped input is the number of sentences in a sample (n),
# axis 2 the number of words in a sentence.
n_sentences, n_words = x_train_reshaped.shape[1:3]

# One embedding layer, initialised from the pre-trained matrix, that consumes
# a whole sample flattened to n_words * n_sentences word indices.
embedding_layer = Embedding(
    len(word_index) + 1,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=n_words * n_sentences,
)

# Sentence-level CNN encoder followed by a GRU across sentences

In [74]:
def _ngram_features(embedded, kernel_size):
    """Apply one convolution width to every sentence and max-pool over its words.

    Args:
        embedded: tensor of shape (batch, n_sentences, n_words, embedding_dim).
        kernel_size: number of consecutive words covered by each filter.

    Returns:
        A ``(conv, pooled)`` pair: the per-sentence convolution output and its
        global max over the word axis (200 features per sentence).
    """
    conv = TimeDistributed(
        Conv1D(filters=200, kernel_size=kernel_size, activation='relu'))(embedded)
    pooled = TimeDistributed(GlobalMaxPooling1D())(conv)
    return conv, pooled


# Input: one sample is a block of n_sentences sentences of n_words indices.
sequence_input = Input(shape=(n_sentences, n_words), dtype='int32')

# Flatten the block so the shared embedding layer sees one long sequence,
# then restore the sentence axis on the embedded output.
sequence_input_r = Reshape((n_sentences * n_words,), input_shape=(n_sentences, n_words))(sequence_input)
embedded_sequences = embedding_layer(sequence_input_r)
embedded_sequences_r = Reshape((n_sentences, n_words, embedding_dim))(embedded_sequences)

# Three parallel convolution branches (widths 3, 4 and 5) per sentence.
conv1d_3, max_3 = _ngram_features(embedded_sequences_r, 3)
conv1d_4, max_4 = _ngram_features(embedded_sequences_r, 4)
conv1d_5, max_5 = _ngram_features(embedded_sequences_r, 5)
convs = [max_3, max_4, max_5]

# Join the branch features along the last axis. `concatenate` replaces the
# legacy `Merge(mode='concat')` layer, which is deprecated in Keras 2 and
# absent from later releases; both concatenate on the last axis by default.
l_merge = concatenate(convs)

# A GRU (the name `lstm_out` is historical) runs across the sentences of a
# block and emits one state per sentence, each classified into 14 classes.
lstm_out = GRU(hidden_dim, return_sequences=True)(l_merge)
preds = TimeDistributed(Dense(14, activation='softmax'))(lstm_out)

  from ipykernel import kernelapp as app


In [75]:
# Assemble, compile and train the model for a single epoch.
model = Model(sequence_input, preds)
model.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['acc'])

# summary() prints the layer table itself and returns None; wrapping it in
# print() appended a stray "None" line to the output.
model.summary()

# Keep the History object so the per-epoch metrics can be inspected later
# instead of only appearing as the cell's repr.
history = model.fit(
    x_train_reshaped,
    y_train_reshaped,
    validation_data=(x_val_reshaped, y_val_reshaped),
    epochs=1,
    batch_size=32,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 4, 81)        0                                            
__________________________________________________________________________________________________
reshape_13 (Reshape)            (None, 324)          0           input_7[0][0]                    
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 324, 300)     5978700     reshape_13[0][0]                 
__________________________________________________________________________________________________
reshape_14 (Reshape)            (None, 4, 81, 300)   0           embedding_4[0][0]                
__________________________________________________________________________________________________
time_distr

<keras.callbacks.History at 0x261386a1860>