In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from utils import *
import pickle
%load_ext autoreload
%autoreload 2


### Read the training data into a pandas dataframe

In [2]:
# Load the full training CSV; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/quora/train.csv')

### Show the first five rows of the data

In [3]:
# Preview the first five rows (qid, question_text, target).
data.head(n=5)

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


### Show a description of the data

In [4]:
# Summary statistics; only `target` appears in the result, and its mean (~0.062)
# is the fraction of rows labelled 1, i.e. the classes are heavily imbalanced.
data.describe()

Unnamed: 0,target
count,1306122.0
mean,0.06187018
std,0.2409197
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


### Read the glove vectors

In [5]:
# Load the GloVe 6B 300-d file through the utils helper and unpack its three return values.
(
    word_to_index,
    index_to_word,
    word_to_vec_map,
) = read_glove_6B_vecs('../glove/glove.6B.300d.txt')

## Alternative: for use with the 840B GloVe vectors (disabled — the code is wrapped in a string literal, so nothing is executed)

In [6]:
"""
word_to_index= pickle.load(open( "../glove/word_to_index.pickle", "rb" ))
index_to_word=pickle.load(open( "../glove/index_to_word.pickle", "rb" ))
word_to_vec_map= pickle.load(open( "../glove/word_to_vec_map.pickle", "rb" ))
"""

'\nword_to_index= pickle.load(open( "../glove/word_to_index.pickle", "rb" ))\nindex_to_word=pickle.load(open( "../glove/index_to_word.pickle", "rb" ))\nword_to_vec_map= pickle.load(open( "../glove/word_to_vec_map.pickle", "rb" ))\n'

In [7]:
# Sanity check: the index lookup and the vector map should hold the same number of words.
len(word_to_index), len(word_to_vec_map)

(400000, 400000)

Get the maximum number of words in any question in the pandas dataframe. We will use this value (`max_len`) as the fixed sequence length of the model input that feeds the embedding layer.

In [8]:
# Longest question, measured in whitespace-separated tokens, plus one.
max_len = (
    data['question_text']
    .str.split()
    .str.len()
    .max()
    + 1
)

Convert `X` (an array of sentences as strings) into an array of indices corresponding to the words in each sentence. The output shape should be such that it can be given to `Embedding()`.

In [9]:
# Training subset: the first 100,000 rows by position, as plain NumPy arrays.
X = np.array(data['question_text'].iloc[:100000])
Y = np.array(data['target'].iloc[:100000])

In [10]:
# One raw question string per row; X is still 1-D because it has not been converted to indices yet.
X.shape

(100000,)

In [11]:
# Map each question to a row of vocabulary indices via the utils helper.
# NOTE(review): presumably pads/truncates every row to max_len — confirm in utils.sentences_to_indices.
X_indices=sentences_to_indices(X, word_to_index, max_len)

In [14]:
def model(input_shape, word_to_vec_map, word_to_index):
    """
    Build the question classifier graph:
    embedding -> LSTM -> Conv1D block -> LSTM -> single sigmoid unit.

    Arguments:
    input_shape -- shape of one input sample, usually (max_len,)
    word_to_vec_map -- dictionary mapping each vocabulary word to its embedding vector;
                       forwarded unchanged to pretrained_embedding_layer
    word_to_index -- dictionary mapping each vocabulary word to its integer index;
                     forwarded unchanged to pretrained_embedding_layer

    Returns:
    model -- an uncompiled tf.keras.Model that maps a batch of index sequences
             to one sigmoid output per sequence
    """
    layers = tf.keras.layers

    # The graph input carries integer word indices, hence dtype 'int32'.
    sentence_indices = layers.Input(shape=input_shape, dtype='int32')

    # Embedding layer supplied by the utils helper.
    # NOTE(review): presumably initialised from the GloVe vectors in word_to_vec_map — confirm in utils.
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    embeddings = embedding_layer(sentence_indices)

    # First recurrent stage: keep the whole sequence so the convolution can slide over time steps.
    hidden = layers.LSTM(128, return_sequences=True)(embeddings)
    hidden = layers.Dropout(0.5)(hidden)

    # Convolutional stage: 196 filters of width 15, stride 1, followed by
    # batch normalisation, ReLU and a dropout layer with rate 0.8.
    hidden = layers.Conv1D(196, 15, strides=1)(hidden)
    hidden = layers.BatchNormalization()(hidden)
    hidden = layers.Activation(activation='relu')(hidden)
    hidden = layers.Dropout(0.8)(hidden)

    # Second recurrent stage: return only the final hidden state, not a sequence.
    hidden = layers.LSTM(128, return_sequences=False)(hidden)
    hidden = layers.Dropout(0.5)(hidden)

    # One sigmoid unit per sample for the binary target.
    output = layers.Dense(1, activation='sigmoid')(hidden)

    # Tie the input tensor to the output tensor.
    classifier = tf.keras.Model(inputs=sentence_indices, outputs=output)
    return classifier

In [15]:
# Build the network for sequences of length max_len and print its layer table.
keras_model = model(
    input_shape=(max_len,),
    word_to_vec_map=word_to_vec_map,
    word_to_index=word_to_index,
)
keras_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 135)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 135, 300)          120000300 
_________________________________________________________________
lstm_1 (LSTM)                (None, 135, 128)          219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 135, 128)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 121, 196)          376516    
_________________________________________________________________
batch_normalization (BatchNo (None, 121, 196)          784       
_________________________________________________________________
activation (Activation)      (None, 121, 196)          0         
__________

In [17]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is tracked during training.
keras_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [18]:
# Two passes over the training subset in mini-batches of 256, shuffling the data.
# Left as a bare expression so the returned History object is still displayed.
keras_model.fit(
    x=X_indices,
    y=Y,
    batch_size=256,
    epochs=2,
    shuffle=True,
)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f1c640f0550>

In [22]:
# Hold-out slice: 1,000 rows (positions 111000-111999), outside the first 100,000 used for training.
X_test = np.array(data['question_text'].iloc[111000:112000])
Y_test = np.array(data['target'].iloc[111000:112000])

In [23]:
# Convert the hold-out questions with the same helper, vocabulary and max_len used for the training data.
X_test_indices=sentences_to_indices(X_test, word_to_index, max_len)

In [24]:
# Run the trained network on the hold-out rows; the model ends in Dense(1, sigmoid),
# so each prediction is a single value between 0 and 1.
preds=keras_model.predict(X_test_indices)

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
# Round the sigmoid outputs to hard 0/1 labels and compare them with the true targets.
# NOTE(review): about 93.8% of all rows have target == 0 (1,225,312 of 1,306,122), so an
# accuracy near that value is what always predicting 0 would score on a slice with a similar
# class balance. Consider precision/recall or F1 before drawing conclusions.
accuracy_score(y_true=Y_test, y_pred=np.round(preds))

0.934

In [31]:
# Per-column non-null counts for the rows labelled 0 (the majority class).
data.loc[data['target'] == 0].count()

qid              1225312
question_text    1225312
target           1225312
dtype: int64