In [58]:
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.data_utils import get_file
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,GlobalAveragePooling1D,Lambda,Bidirectional
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam, RMSprop
from keras import backend as K
from keras.layers.embeddings import Embedding

In [12]:
data=pd.read_csv('data/quora/quora_duplicate_questions.tsv', sep='\t')
data['question1'] = data['question1'].apply(lambda x: (str(x)))
data['question2'] = data['question2'].apply(lambda x: (str(x)))
data.head()
data['question1'] = data['question1'].apply(lambda x: (str(x)))
data['question2'] = data['question2'].apply(lambda x: (str(x)))
target=data['is_duplicate']
question1 = list(data['question1'])
question2 = list(data['question2'])

In [81]:
data[50:100]

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
50,50,101,102,Is Career Launcher good for RBI Grade B prepar...,How is career launcher online program for RBI ...,1
51,51,103,104,Will a Blu Ray play on a regular DVD player? I...,How can you play a Blu Ray DVD on a regular DV...,1
52,52,105,106,Nd she is always sad?,Aerodynamically what happens when propellor ro...,0
53,53,107,108,What is the best/most memorable thing you've e...,What is the most delicious dish you've ever ea...,1
54,54,109,110,How GST affects the CAs and tax officers?,Why can't I do my homework?,0
55,55,111,112,How difficult is it get into RSI?,Do you apply for programs like RSI when you're...,0
56,56,113,114,Who is israil friend?,Is my boyfriend lying about his true feelings ...,0
57,57,115,116,What are some good rap songs to dance to?,What are some of the best rap songs?,0
58,58,117,118,I was suddenly logged off Gmail. I can't remem...,I can't remember my Gmail password or my recov...,1
59,59,119,120,What are the best ways to learn French?,How do I learn french genders?,0


In [15]:
tokenizer = Tokenizer(num_words=200000)
tokenizer.fit_on_texts(question1+question2)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index

In [47]:
labels = np.array(target, dtype=int)

In [87]:
len(labels) - np.sum(labels)

255027

In [17]:
q1_data = pad_sequences(question1_word_sequences, maxlen=25)
q2_data = pad_sequences(question2_word_sequences, maxlen=25)

In [41]:
embeddings_index = {}
f = open('data/glove/glove.840B.300d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    try:
        coefs = np.asarray(values[1:], dtype='float32')
    except:
        print()
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))





















Found 2195884 word vectors.


In [42]:
iterator = iter(embeddings_index.keys())
for i in range(3):
    print(next(iterator))
    
embeddings_index

,
.
the


In [52]:
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        #print(word)
        embedding_matrix[i] = embedding_vector
    else:
        pass
        #print('Found')

In [None]:
np.save(open('q1_train.npy', 'wb'), q1_data)
np.save(open('q2_train.npy', 'wb'), q2_data)
np.save(open('label_train.npy', 'wb'), labels)
np.save(open('word_embedding_matrix.npy', 'wb'), embedding_matrix)

In [53]:
q1_data = np.load(open('q1_train.npy', 'rb'))
q2_data = np.load(open('q2_train.npy', 'rb'))

labels = np.load(open('label_train.npy', 'rb'))
embedding_matrix = np.load(open('word_embedding_matrix.npy', 'rb'))

In [54]:
from sklearn.cross_validation import train_test_split

X = np.stack((q1_data, q2_data), axis=1)
target = labels

X_train, X_val, y_train, y_val = train_test_split(X, target, test_size=0.25, random_state=126, stratify=target)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
Q1_val = X_val[:,0]
Q2_val = X_val[:,1]



In [56]:
def vec_distance(vects):
    x, y = vects
    return K.sum(K.square(x - y), axis=1, keepdims=True)
def vec_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

In [61]:
nb_words=95597
max_sentence_len=25
embedding_layer = Embedding(nb_words,300,
        weights=[embedding_matrix],
        input_length=max_sentence_len,trainable=False)

In [62]:
lstm_layer =LSTM(128)

sequence_1_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(max_sentence_len,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

distance=Lambda(vec_distance, output_shape=vec_output_shape)([x1, y1])
dense1=Dense(16, activation='sigmoid')(distance)
dense1 = Dropout(0.3)(dense1)

bn2 = BatchNormalization()(dense1)
prediction=Dense(1, activation='sigmoid')(bn2)

model = Model(input=[sequence_1_input, sequence_2_input], output=prediction)



In [63]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 25, 300)      28679100    input_2[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   (None, 128)          219648      embedding_3[0][0]                
          

In [65]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
early_stopping =EarlyStopping(monitor='val_loss', patience=3)

In [72]:
import math

# labels_dict : {ind_label: count_label}
# mu : parameter to tune 

def create_class_weight(labels_dict, mu=2):
    total = np.sum(labels_dict.values())
    print(total)
    keys = labels_dict.keys()
    class_weight = dict()

    for key in keys:
        temp = labels_dict[key]
        print(temp)
        score = math.log(mu * total/float(temp))
        class_weight[key] = score if score > 1.0 else 1.0

    return class_weight

In [None]:
unique, counts = np.unique(target, return_counts=True)
labels_dict=dict(zip(unique, counts))

target_weight=create_class_weight(labels_dict)
target_weight

In [74]:
hist=model.fit([Q1_train, Q2_train], y_train, validation_data=([Q1_val, Q2_val], y_val), verbose=1, 
          nb_epoch=10, batch_size=256, shuffle=True,class_weight=None, callbacks=[early_stopping])

  


Train on 303217 samples, validate on 101073 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
