In [49]:
import os
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import utils as np_utils
from keras.layers import Concatenate, Embedding, Merge, LSTM, GRU
from keras.layers import Input, Dense, Dropout, Reshape
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.core import Flatten
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [50]:
# Load the pickled dataset. The context manager closes the handle even when
# unpickling raises, which the manual open()/close() pair did not guarantee.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# come from a trusted source.
with open('data/reduced_data.pkl', 'rb') as f:
    data = pickle.load(f)

In [51]:
# Wrap the unpickled records in a DataFrame with the columns 'data' and 'labels'.
df=pd.DataFrame(data,columns=['data','labels'])

In [52]:
# Pull both DataFrame columns out as plain Python lists.
# `data` is rebound here: from this point on it holds the 'data' column only.
data, labels = (df[column].tolist() for column in ('data', 'labels'))

# Tokenize the data (strings)

In [53]:
# Fit a Keras Tokenizer on the texts, then convert every text into a sequence.
# `tokenizer.word_index` is reused later to build the embedding matrix.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

# Convert the target labels to numeric class ids

In [54]:
# Replace every label in `labels` (in place) with an integer id.
# `dict1` maps each distinct label to its id; ids are handed out in order of
# first appearance and start at 1, so id 0 is never assigned.
dict1 = {}
inc = 0
for idx, raw_label in enumerate(labels):
    if raw_label not in dict1:
        # First time this label is seen: give it the next free id.
        inc += 1
        dict1[raw_label] = inc
    labels[idx] = dict1[raw_label]

In [55]:
# Array-of-arrays view of the tokenized texts. The sequences may differ in
# length, so dtype=object is spelled out: implicit creation of ragged arrays is
# deprecated and raises on recent NumPy releases.
# NOTE(review): `data` is reassigned by the pad_sequences call in the following
# cell, so this value is not read anywhere in the visible notebook.
data = np.array([np.array(xi) for xi in sequences], dtype=object)

In [56]:
# Turn the token-id sequences into a fixed-width array (81 columns, see the
# printed shape below). The literal 81 must stay equal to MAX_SEQUENCE_LENGTH,
# which is used as the Embedding layer's input_length further down.
data = pad_sequences(sequences, maxlen = 81)

In [57]:
# Convert the integer class ids to a categorical matrix and report both shapes.
# NOTE(review): the ids produced above start at 1, so column 0 of the resulting
# matrix is presumably never set -- confirm whether the extra class is intended.
labels = np_utils.to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (222768, 81)
Shape of label tensor: (222768, 14)


# Split into training and validation sets

In [58]:
# Shuffle samples and labels with the same permutation and work out how many
# rows go to the validation set.
VALIDATION_SPLIT = 0.2
# A fixed seed makes the train/validation split reproducible across kernel
# restarts; a local RandomState keeps NumPy's global RNG untouched.
SPLIT_SEED = 42
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [59]:
# The last `nb_validation_samples` rows become the validation set, the rest is
# used for training. An explicit split index is used instead of negative
# slicing: with nb_validation_samples == 0, `data[:-0]` would be empty and
# `data[-0:]` would contain every row, silently swapping the two sets.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

# Load the pre-trained word vectors (GloVe)

In [60]:
# Read the GloVe text file into `embeddings_index`: word -> float32 vector.
# Each line is split on whitespace; the first field is the word and the
# remaining fields are its coefficients.
GLOVE_DIR = 'glove/'
embeddings_index = {}
# The context manager closes the file even if a malformed line raises.
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


# Build the embedding matrix: one GloVe vector per vocabulary word

In [61]:
# Build the weight matrix for the Embedding layer: row `row` holds the GloVe
# vector of the word whose tokenizer index is `row`.
EMBEDDING_DIM = 300
word_index = tokenizer.word_index
# One extra row beyond len(word_index); presumably because tokenizer indices
# start at 1 and row 0 is left for padding -- TODO confirm.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
null_vectors = 0  # vocabulary words with no GloVe entry
total = 0         # vocabulary words visited
for word, row in word_index.items():
    total += 1
    vector = embeddings_index.get(word)
    if vector is None:
        # Words not found in the embedding index keep their all-zero row.
        null_vectors += 1
        continue
    embedding_matrix[row] = vector

In [62]:
# Embedding layer initialised from the GloVe matrix. trainable=False keeps the
# pre-trained weights fixed during training.
MAX_SEQUENCE_LENGTH = 81
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [63]:
# Model graph: three parallel Conv1D branches (kernel sizes 3, 4, 5) over the
# embedded sequence, each globally max-pooled, concatenated, then fed through a
# GRU and a softmax classifier.
convs = []
filter_sizes = [3, 4, 5]
num_filters = 200
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
for fsz in filter_sizes:
    # `filters` / `kernel_size` are the Keras 2 names of nb_filter / filter_length.
    l_conv = Conv1D(filters=num_filters, kernel_size=fsz, activation='relu')(embedded_sequences)
    l_pool = GlobalMaxPooling1D()(l_conv)
    convs.append(l_pool)
# Concatenate replaces the legacy Merge(mode='concat') layer; both join the
# pooled branches along the last axis.
l_merge = Concatenate()(convs)
# Treat the concatenated feature vector as a sequence of scalars so the GRU
# can consume it: (num_filters * number of branches, 1).
l_merge_r = Reshape((num_filters * len(filter_sizes), 1))(l_merge)
# Despite the variable name, this is a GRU layer, not an LSTM.
lstm_out = GRU(128)(l_merge_r)
# One output unit per column of the one-hot label matrix.
preds = Dense(y_train.shape[1], activation='softmax')(lstm_out)


In [64]:
# Assemble, compile and train the model for one epoch.
model = Model(sequence_input, preds)
model.compile(optimizer='adagrad', loss='categorical_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
# Keep the History object so the per-epoch metrics stay accessible afterwards.
history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=1, batch_size=32)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_6 (InputLayer)            (None, 81)           0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 81, 300)      5978700     input_6[0][0]                    
__________________________________________________________________________________________________
conv1d_16 (Conv1D)              (None, 79, 200)      180200      embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_17 (Conv1D)              (None, 78, 200)      240200      embedding_4[0][0]                
__________________________________________________________________________________________________
conv1d_18 

<keras.callbacks.History at 0x1cee3480080>