In [1]:
from keras.models import Sequential
from keras.layers import Input, Dense, Dropout
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from keras import utils as np_utils
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Model
from keras.layers.core import Flatten
import pickle
import os

Using TensorFlow backend.


In [2]:
# Load the pickled dataset. The context manager guarantees the file handle is
# closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open('data/reduced_data.pkl', 'rb') as f:
    data = pickle.load(f)

In [3]:
# Wrap the loaded records in a two-column frame named 'data' and 'labels'.
df = pd.DataFrame(data=data, columns=['data', 'labels'])

In [4]:
# Pull both columns out of the frame as plain Python lists.
data, labels = df['data'].tolist(), df['labels'].tolist()

# Tokenize the data (strings)

In [5]:
# Fit a Keras Tokenizer on the raw texts, then encode every text as a list of
# integer word indices. `tokenizer.word_index` is reused further down when the
# embedding matrix is built.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

# Find the median length of the data

In [6]:
# Number of space-separated tokens in each sentence.
count = [len(sentence.split(' ')) for sentence in data]

In [7]:
# Median sentence length; np.median accepts the plain list directly.
median = np.median(count)

In [8]:
# Fixed length that every encoded sequence is padded/truncated to. The same
# value is used as the model's input length further down, so it is named here
# instead of being repeated as a bare literal.
# NOTE(review): 81 presumably comes from the sentence-length statistics
# computed above — confirm.
MAX_SEQUENCE_LENGTH = 81
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Convert the target labels to numeric IDs

In [9]:
# Replace every label in `labels` (in place) with an integer ID, assigned in
# order of first appearance. IDs start at 1, so 0 is never used; the output
# layer further down is sized len(dict1)+1 to match.
dict1 = {}
for idx, label in enumerate(labels):
    # setdefault stores a fresh ID only the first time a label is seen and
    # returns the existing ID otherwise.
    labels[idx] = dict1.setdefault(label, len(dict1) + 1)
inc = len(dict1)  # running ID counter: equals the number of distinct labels

In [10]:
# One-hot encode the integer label IDs, then report both tensor shapes.
label_ids = np.asarray(labels)
labels = np_utils.to_categorical(label_ids)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (222768, 81)
Shape of label tensor: (222768, 14)


# Split into training and validation sets

In [11]:
VALIDATION_SPLIT = 0.2
SPLIT_SEED = 42  # fixed seed so the train/validation split is reproducible

# Shuffle samples and labels with one shared permutation so each row keeps its
# label. A dedicated RandomState is used so the global NumPy RNG is untouched.
indices = np.arange(data.shape[0])
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]

# Number of samples held out for validation (fraction of the full set).
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [12]:
# Split at an explicit positive index: the last `nb_validation_samples` rows
# form the validation set. Slicing with a negative offset would misbehave when
# nb_validation_samples is 0 (data[:-0] is empty and data[-0:] is everything).
split_at = data.shape[0] - nb_validation_samples
x_train, y_train = data[:split_at], labels[:split_at]
x_val, y_val = data[split_at:], labels[split_at:]

# Load the pre-trained word vectors (GloVe)

In [13]:
GLOVE_DIR = 'glove/'

# Parse the GloVe text file into {word: float32 vector}. Each line holds the
# word followed by its whitespace-separated coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt'), encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


# Build the embedding matrix for the vocabulary from the GloVe vectors

In [14]:
EMBEDDING_DIM = 300
word_index = tokenizer.word_index

# One row per tokenizer index (plus one extra row for index 0). Row i receives
# the GloVe vector of the word whose tokenizer index is i.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

null_vectors = 0  # vocabulary words with no GloVe vector
total = 0         # vocabulary words examined
for word, i in word_index.items():
    total += 1
    vector = embeddings_index.get(word)
    if vector is None:
        # Words missing from the embedding index keep their all-zero row.
        null_vectors += 1
        continue
    embedding_matrix[i] = vector

In [15]:
MAX_SEQUENCE_LENGTH = 81

# Embedding layer initialised with the pre-built matrix and frozen
# (trainable=False) so those weights are not updated during training.
vocab_size = len(word_index) + 1  # same row count as embedding_matrix
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [16]:
# Functional-API graph: integer sequence -> embedding -> LSTM -> softmax.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
print(sequence_input.shape)
embedded_sequences = embedding_layer(sequence_input)
# A single LSTM layer with 128 units feeds the classifier.
x = LSTM(128)(embedded_sequences)
# len(dict1)+1 output units: label IDs were assigned starting at 1, so the
# one-hot label tensor has one extra (never-used) column for ID 0.
preds = Dense(len(dict1)+1, activation='softmax')(x)

(?, 81)


In [17]:
# Assemble and compile the classifier: categorical cross-entropy over the
# one-hot labels, Adagrad optimiser, accuracy reported as 'acc'.
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adagrad',
              metrics=['acc'])

# summary() prints the table itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit a stray "None" line.
model.summary()

# Train for one epoch, evaluating on the held-out validation split.
history = model.fit(x_train, y_train,
                    validation_data=(x_val, y_val),
                    epochs=1,
                    batch_size=32)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 81)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 81, 300)           5978700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_1 (Dense)              (None, 14)                1806      
Total params: 6,200,154
Trainable params: 221,454
Non-trainable params: 5,978,700
_________________________________________________________________
None
Train on 178215 samples, validate on 44553 samples
Epoch 1/1
