In [1]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense 
from keras.layers import Flatten 
from keras.layers import Embedding

import pandas as pd

Using TensorFlow backend.


In [2]:
import os

cwd = os.getcwd()  # Get the current working directory (cwd)
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in '%s': %s" % (cwd, files))

Files in '/Users/alket/Desktop/DeepLearningKerasQuoraChallenge': ['.DS_Store', 'Train2.csv', 'TestGloveEmbeddings.ipynb', '.ipynb_checkpoints', 'glove.6B', 'dati_test.csv']


In [14]:
# define data sources 
#data = pd.read_csv('Train2.csv', nrows=70000)
data = pd.read_csv('dati_test.csv')
data.head(2)
data.shape

(213, 3)

In [15]:
docs = data['question_text'].tolist()
lab = data['target'].tolist()
labels = array(lab)

In [16]:
# define the tokenizer and set vocabulary size 
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

In [17]:
# integer encode the documents 
encoded_docs = t.texts_to_sequences(docs)
print(encoded_docs)

# pad documents to a max length of 100 words
max_length = 100
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post') 
print(padded_docs)

[[60, 1, 376, 129, 96, 1, 202, 377, 8, 1, 48], [67, 130, 10, 49, 378, 2, 20, 379, 380, 131, 130, 27, 381, 131, 130], [26, 382, 203, 204, 205, 4, 383, 384, 14, 385, 6, 55, 23, 386, 206], [12, 207, 208, 209, 4, 12, 82, 24, 387, 209, 97, 5, 388, 56, 12, 389, 57, 11, 2, 7, 97, 5, 390, 4, 12, 35, 2, 132, 50, 391, 11, 40, 12, 7], [67, 392, 19, 1, 393, 394], [6, 7, 395, 133, 396, 210], [15, 7, 12, 397, 44, 98, 211, 14, 3, 212, 398, 15, 99, 7, 23, 213], [6, 7, 100, 134, 399, 1, 400, 401, 30, 8, 214, 17, 135, 32, 101, 215, 402, 136, 216, 403, 404, 405, 2, 406, 217, 202, 137], [41, 407, 408, 409, 3, 410, 411, 18, 50, 412, 2, 413, 1, 414, 102, 415], [8, 11, 218, 60, 206, 416, 9, 417, 418, 138, 219, 4, 11, 419, 19, 23, 83, 420, 1, 421, 9, 129, 2, 139, 20, 68, 220], [61, 100, 422, 2, 423, 8, 1, 424, 221, 14, 1, 425, 9, 84, 4, 69, 42, 1, 222, 26, 23, 426, 16, 140, 141, 427, 84, 4, 97, 28, 7, 2, 100, 11, 1, 428, 35, 2, 7, 2, 1, 222], [6, 10, 103, 223, 429, 430, 4, 85, 431, 432, 2, 25, 433, 434, 142, 

In [18]:
# load the whole embedding into memory
embeddings_index = dict()

# put your path to glove here 
f = open('glove.6B/glove.6B.100d.txt', mode='rt', encoding='utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32') 
    embeddings_index[word] = coefs
f.close()

print('Loaded %s word vectors.' % len(embeddings_index)) 

Loaded 400000 word vectors.


In [19]:
# create a weight matrix for words in training docs 
embedding_matrix = zeros((vocab_size, 100))

for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [20]:
# define model
model = Sequential()
e = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=100, trainable=False) 
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [21]:
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [22]:
# summarize the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          136500    
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 10001     
Total params: 146,501
Trainable params: 10,001
Non-trainable params: 136,500
_________________________________________________________________


In [23]:
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)

<keras.callbacks.History at 0x12d1c0390>

In [24]:
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 99.061033
