In [35]:
from keras import utils as np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from keras.layers import Embedding, Merge, LSTM, GRU, Reshape
from keras.layers import Input, Dense, Dropout, TimeDistributed
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Model
from keras.layers.core import Flatten
import pickle
from keras.models import Sequential
import os
from keras.engine.topology import Layer

In [68]:
# Load the pickled dataset from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load this file if it comes from a trusted source.
# The context manager closes the handle even when unpickling raises.
with open('data/reduced_data.pkl','rb') as f:
    data=pickle.load(f)

In [69]:
# Put the unpickled records into a frame with a text column and a label column.
# NOTE(review): presumably each record is a (text, label) pair -- confirm
# against whatever produced reduced_data.pkl.
df = pd.DataFrame(data=data, columns=['data', 'labels'])

In [70]:
# Pull both columns back out as plain Python lists; `data` is rebound from the
# unpickled object to the list of texts, `labels` holds the matching targets.
data, labels = (df[column].tolist() for column in ('data', 'labels'))

# Tokenize the data (strings)

In [56]:
# Fit a Keras Tokenizer on the raw texts, then encode every text as a list of
# integer token ids. `tokenizer` is kept around because its word_index is
# needed later to size the embedding layer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

# Convert the target labels to integer class ids

In [57]:
# Replace every label in `labels` (in place) with an integer id, assigning ids
# in order of first appearance. `dict1` ends up as the label -> id mapping and
# `inc` as the number of distinct labels.
# NOTE(review): ids start at 1, not 0, so one-hot encoding these ids later
# presumably leaves column 0 unused -- confirm before changing, since the
# class count is hard-coded in later cells.
dict1 = {}
inc = 0
for position, label in enumerate(labels):
    if label not in dict1:
        # First time this label is seen: hand out the next id.
        inc += 1
        dict1[label] = inc
    labels[position] = dict1[label]

In [58]:
# NOTE(review): this array is never consumed -- the following cell rebuilds
# `data` directly from `sequences` with pad_sequences.
# The token sequences generally differ in length; dtype=object keeps this cell
# runnable on NumPy >= 1.24, where building an array from ragged rows without
# an explicit object dtype raises ValueError.
data=np.array([np.array(xi) for xi in sequences], dtype=object)

In [59]:
# Bring every token sequence to one fixed length so the result is a dense
# 2-D array (with Keras defaults, padding and truncation happen at the front).
MAX_SEQUENCE_LENGTH = 81
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [60]:
# One-hot encode the integer class ids, then report both tensor shapes.
# NOTE: `labels` is rebound from a list of ids to a 2-D array here, so this
# cell is not meant to be re-run without re-running the cells above it.
label_ids = np.asarray(labels)
labels = np_utils.to_categorical(label_ids)
for caption, tensor in (('Shape of data tensor:', data),
                        ('Shape of label tensor:', labels)):
    print(caption, tensor.shape)

Shape of data tensor: (222768, 81)
Shape of label tensor: (222768, 14)


# Split into training and validation sets

In [61]:
# Shuffle samples and targets with the same permutation, then work out how
# many rows go to the validation set.
VALIDATION_SPLIT=0.2
# A fixed seed makes the train/validation split reproducible across kernel
# restarts; a local RandomState avoids touching NumPy's global RNG state.
SEED = 42
rng = np.random.RandomState(SEED)
indices = rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]
# int() truncates, so the validation share is rounded down.
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [62]:
# Hold out the last `nb_validation_samples` rows for validation and train on
# the rest. An explicit split index is used instead of negative slicing:
# with nb_validation_samples == 0, `data[:-0]` would be empty and `data[-0:]`
# would be the whole array, silently swapping the two sets.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [63]:
# Group rows into blocks of `number_of_samples` consecutive rows. Rows that
# do not fill a complete block are dropped from the end of each split so the
# arrays can later be reshaped to (blocks, number_of_samples, ...).
number_of_samples = 4
number_of_blocks_train = x_train.shape[0] // number_of_samples
number_of_blocks_val = x_val.shape[0] // number_of_samples

train_rows = number_of_blocks_train * number_of_samples
x_train_new, y_train_new = x_train[:train_rows], y_train[:train_rows]

val_rows = number_of_blocks_val * number_of_samples
x_val_new, y_val_new = x_val[:val_rows], y_val[:val_rows]

In [64]:
# Fold the flat row axis into (block, position-in-block). The trailing width
# (sequence length for x, number of one-hot columns for y) is read from the
# arrays instead of being hard-coded, so this cell keeps working if the
# padding length or the number of classes changes upstream.
seq_len = x_train_new.shape[-1]
n_classes = y_train_new.shape[-1]

x_train_reshaped = x_train_new.reshape(number_of_blocks_train, number_of_samples, seq_len)
y_train_reshaped = y_train_new.reshape(number_of_blocks_train, number_of_samples, n_classes)

x_val_reshaped = x_val_new.reshape(number_of_blocks_val, number_of_samples, seq_len)
y_val_reshaped = y_val_new.reshape(number_of_blocks_val, number_of_samples, n_classes)

# Sentence-level RNN

In [65]:
# Hierarchical model: a word-level GRU encodes each sentence, a sentence-level
# GRU runs over the sentence encodings of a block, and a softmax head emits
# one class distribution per sentence.
embedding_dim = 300
hidden_dim = 128
word_index = tokenizer.word_index
n_sentences = x_train_reshaped.shape[1]  # number of sentences in a sample (n)
n_words = x_train_reshaped.shape[2]  # number of words in a sentence
# Width of the output layer follows the one-hot targets rather than a
# hard-coded class count, so it cannot drift from the label encoding.
n_classes = y_train_reshaped.shape[-1]

model = Sequential()
# Flatten (n_sentences, n_words) token ids into one long id sequence so a
# single Embedding layer can look them all up.
model.add(Reshape((n_sentences * n_words,), input_shape=(n_sentences, n_words)))
# +1 because Tokenizer word indices start at 1; index 0 is the padding value.
model.add(Embedding(len(word_index) + 1, embedding_dim, input_length=n_words * n_sentences))
# Restore the sentence axis: (n_sentences, n_words, embedding_dim).
model.add(Reshape((n_sentences, n_words, embedding_dim)))
# Word-level GRU applied to each sentence independently -> one hidden_dim
# vector per sentence.
model.add(TimeDistributed(GRU(hidden_dim)))
# Sentence-level GRU over the sentence vectors; return_sequences=True keeps
# one output per sentence.
model.add(GRU(hidden_dim, return_sequences=True))
# Per-sentence class probabilities.
model.add(TimeDistributed(Dense(n_classes, activation='softmax')))

In [66]:
# Compile, show the architecture, and train for one epoch.
model.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()
# Kept as the last expression so the returned History object is displayed as
# the cell output.
model.fit(x_train_reshaped, y_train_reshaped, validation_data=(x_val_reshaped, y_val_reshaped), epochs=1, batch_size=64)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_13 (Reshape)         (None, 324)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 324, 300)          5978700   
_________________________________________________________________
reshape_14 (Reshape)         (None, 4, 81, 300)        0         
_________________________________________________________________
time_distributed_11 (TimeDis (None, 4, 128)            164736    
_________________________________________________________________
gru_6 (GRU)                  (None, 4, 128)            98688     
_________________________________________________________________
time_distributed_12 (TimeDis (None, 4, 14)             1806      
Total params: 6,243,930
Trainable params: 6,243,930
Non-trainable params: 0
_________________________________________________________________


<keras.callbacks.History at 0x2a61a6132b0>