In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from utils import *
import pickle
%load_ext autoreload
%autoreload 2


### Read the training data into a pandas dataframe

In [2]:
data = pd.read_csv('../data/quora/train.csv')

### Show the first  five rows of the data

In [3]:
data.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


### Show count of classes in the data

In [7]:
data['target'][data['target']==0].count(),data['target'][data['target']==1].count()

(1225312, 80810)

Clearly the class 0 data is way much than the class 1 labeled data. 
To avoid the problem we will undersample class 0 labelled data to the number of class 1 data. 
Another way would be to oversample class1 data but this would result to overfitting class 1 data

In [10]:
class_1_data=data[data['target']==1]
class_0_data=data[data['target']==0]

In [14]:
class_0_data_under_sampled=class_0_data.sample(n=80810)

In [17]:
class_0_data_under_sampled.head()

Unnamed: 0,qid,question_text,target
1110275,d9902b76531003fe88ee,"What is the relationship between pressure, den...",0
59055,0b9576490062c24c6377,Do you think Pakistan will survive as a countr...,0
878491,ac1cd40adb781ef99e7b,Do you agree with New Orleans removing Confede...,0
363986,475971cb42846eaf75fe,When did white people become so afraid of blac...,0
760545,9503071e231bbb3df348,What are all the lies that Donald Trump has told?,0


we can now view the number of items in the sampled dataframe

In [18]:
class_0_data_under_sampled.count()

qid              80810
question_text    80810
target           80810
dtype: int64

Concatenate the two dataframes

In [20]:
concatenated_data=class_0_data_under_sampled.append(class_1_data)

In [22]:
concatenated_data.head()

Unnamed: 0,qid,question_text,target
1110275,d9902b76531003fe88ee,"What is the relationship between pressure, den...",0
59055,0b9576490062c24c6377,Do you think Pakistan will survive as a countr...,0
878491,ac1cd40adb781ef99e7b,Do you agree with New Orleans removing Confede...,0
363986,475971cb42846eaf75fe,When did white people become so afraid of blac...,0
760545,9503071e231bbb3df348,What are all the lies that Donald Trump has told?,0


In [23]:
concatenated_data.tail()

Unnamed: 0,qid,question_text,target
1306093,fffeba722d9b371bd1b9,How is it to have intimate relation with your ...,1
1306094,fffee269360dd0d3947a,Why is it when singers have lyrics about voice...,1
1306099,ffff0e4ea1bb6e16feec,Do pakis smell of curry and shit?,1
1306103,ffff3f0a2449ffe4b9ff,Isn't Trump right after all? Why should the US...,1
1306112,ffffa5b0fa76431c063f,Are you ashamed of being an Indian?,1


Since all the rows containing data for the label 0 are the first   rows . We shuffle the data

In [24]:
concatenated_data=concatenated_data.sample(frac=1).reset_index(drop=True)

In [26]:
concatenated_data.head(10)

Unnamed: 0,qid,question_text,target
0,bd08123b2817e6fd3c98,"Why is everybody fussed about the US, the UK a...",1
1,995833b2734d05695b5d,What jobs does most imigrants from Punjab (Ind...,0
2,9a82a7548b1e053f50fd,What is the name of the movie in the below men...,1
3,11b81f27151c5351ba80,"Should I be ashamed of my ""white"" skin and the...",1
4,244535d7949c96ffbff1,Why does the Secretary of Defense Mattis say t...,1
5,0507cd0157ac7cd73c34,Why did the US never have a super battleship l...,1
6,de933bb220398d605181,"Can you finish this statement? ""blacks are mor...",1
7,6c71ad8f332dbd42c15a,What is the best strategy to crack NEET with 5...,0
8,dd4f70056d743ff16efc,How is a wasp nest made?,0
9,d43703ed2d8a0468de35,Why are Indians so good at discrimination on t...,1


In [28]:
concatenated_data.count()

qid              161620
question_text    161620
target           161620
dtype: int64

### Read the glove vectors

In [27]:
word_to_index, index_to_word, word_to_vec_map = read_glove_6B_vecs('../glove/glove.6B.300d.txt')

## for use with  840B glove vectors

In [29]:
"""
word_to_index= pickle.load(open( "../glove/word_to_index.pickle", "rb" ))
index_to_word=pickle.load(open( "../glove/index_to_word.pickle", "rb" ))
word_to_vec_map= pickle.load(open( "../glove/word_to_vec_map.pickle", "rb" ))
"""

'\nword_to_index= pickle.load(open( "../glove/word_to_index.pickle", "rb" ))\nindex_to_word=pickle.load(open( "../glove/index_to_word.pickle", "rb" ))\nword_to_vec_map= pickle.load(open( "../glove/word_to_vec_map.pickle", "rb" ))\n'

In [30]:
len(word_to_index), len(word_to_vec_map)

(400000, 400000)

 Get the maximum count of words in a string in the pandas dataframe . using this value we will initialize the parameters to the embeddding layer

In [31]:
max_len=concatenated_data['question_text'].str.split().str.len().max()+1 

convert X (array of sentences as strings) into an array of indices corresponding to words in the sentences. The output shape should be such that it can be given to `Embedding()`

In [32]:
X=np.array(concatenated_data['question_text'][:10000])
Y=np.array(concatenated_data['target'][:10000])

In [33]:
X.shape

(10000,)

In [34]:
X_indices=sentences_to_indices(X, word_to_index, max_len)

In [51]:
def model(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating the Emojify-v2 model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    #
    # Define sentence_indices as the input of the graph, it should be of shape input_shape and dtype 'int32' (as it contains indices).
    sentence_indices = tf.keras.layers.Input(shape=(input_shape),dtype='int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer, you get back the embeddings
    embeddings = embedding_layer(sentence_indices)
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a batch of sequences.
    X = tf.keras.layers.LSTM(128, return_sequences=True,recurrent_initializer='glorot_uniform',kernel_initializer='glorot_uniform')(embeddings)
    # Add dropout with a probability of 0.5
    X = tf.keras.layers.Dropout(0.5)(X)
    # CONV layer 
    X = tf.keras.layers.Conv1D(196, 15, strides=1,kernel_initializer='glorot_uniform')(X)                                 # CONV1D
    X = tf.keras.layers.BatchNormalization()(X)                                 # Batch normalization
    X = tf.keras.layers.Activation(activation='relu')(X)                                # ReLu activation
    X = tf.keras.layers.Dropout(0.8)(X)        
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # Be careful, the returned output should be a single hidden state, not a batch of sequences.
    X = tf.keras.layers.LSTM(128,return_sequences=False,recurrent_initializer='glorot_uniform',kernel_initializer='glorot_uniform')(X)
    # Add dropout with a probability of 0.5
    X = tf.keras.layers.Dropout(0.5)(X)
    # Propagate X through a Dense layer with sigmoid activation to get back a batch of 5-dimensional vectors.
    X = tf.keras.layers.Dense(1, activation='sigmoid',kernel_initializer='glorot_uniform')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = tf.keras.Model(inputs=sentence_indices,outputs=X)
    
    ### END CODE HERE ###
    
    return model

In [52]:
keras_model = model((max_len,), word_to_vec_map, word_to_index)
keras_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 65)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 65, 300)           120000300 
_________________________________________________________________
lstm_2 (LSTM)                (None, 65, 128)           219648    
_________________________________________________________________
dropout_3 (Dropout)          (None, 65, 128)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 51, 196)           376516    
_________________________________________________________________
batch_normalization_1 (Batch (None, 51, 196)           784       
_________________________________________________________________
activation_1 (Activation)    (None, 51, 196)           0         
__________

In [53]:
keras_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [61]:
keras_model.fit(X_indices, Y, epochs = 5, batch_size = 128, shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe06a6c66a0>

In [72]:
X_test=np.array(data['question_text'][1100000:1200000])
Y_test=np.array(data['target'][1100000:1200000])

In [73]:
X_test_indices=sentences_to_indices(X_test, word_to_index, max_len)

In [74]:
preds=keras_model.predict(X_test_indices)

In [75]:
from sklearn.metrics import accuracy_score

In [76]:
accuracy_score(Y_test,np.round(preds))

0.84271