In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Conv1D, Dropout, Embedding, Input, Concatenate
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split

In [None]:
# Read both Quora question-pair splits (train first, then test).
# Paths are relative to the notebook's working directory.
train_df, test_df = (
    pd.read_csv('../../datasets/quora/train_quora.csv'),
    pd.read_csv('../../datasets/quora/test_quora.csv'),
)

In [None]:
# (rows, columns) of each split, one per line: train, then test.
print(train_df.shape, test_df.shape, sep='\n')

In [None]:
# Shape of the de-duplicated (question1, question2) pairs across train + test.
(
    pd.concat([train_df, test_df])
    .loc[:, ['question1', 'question2']]
    .drop_duplicates()
    .shape
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Per-column count of missing values in the training split.
train_df.isna().sum()

In [None]:
# Per-column count of missing values in the test split.
test_df.isna().sum()

In [None]:
# Replace every missing value (in any column) with an empty string so the
# tokenizer downstream only ever sees str inputs. Re-running this cell is a no-op.
train_df, test_df = (frame.fillna('') for frame in (train_df, test_df))

In [None]:
# Re-check after the fill: every column of the training split should now report 0.
train_df.isna().sum()

In [None]:
# Re-check after the fill: every column of the test split should now report 0.
test_df.isna().sum()

In [None]:
# Model inputs are the two question text columns; the training target is the
# 'is_duplicate' column. The test split only contributes the question columns.
X_train, y_train = (
    train_df.loc[:, ['question1', 'question2']],
    train_df.loc[:, 'is_duplicate'],
)
X_test = test_df.loc[:, ['question1', 'question2']]

In [None]:
# Corpus used to build the vocabulary: train q1, train q2, test q1, test q2
# stacked into a single Series (in that order).
# NOTE(review): test-split text is part of the vocabulary corpus — confirm this is intended.
all_texts = pd.concat(
    [split[col] for split in (X_train, X_test) for col in ('question1', 'question2')]
)

# Vocabulary is capped with num_words=1000; the Embedding layers downstream
# use the same number as their input dimension.
tokenizer = Tokenizer(num_words=1000)

print(all_texts.shape)

In [None]:
# Build the tokenizer's word index from all_texts (questions of both splits).
# NOTE(review): this includes test-split text — confirm that is acceptable for the evaluation.
tokenizer.fit_on_texts(all_texts)

In [None]:
# Convert each training question into its list of integer token ids,
# keeping one column per question.
X_train_tokenized = pd.DataFrame(
    {
        col: tokenizer.texts_to_sequences(X_train[col])
        for col in ('question1', 'question2')
    }
)

In [None]:
# Preview the first rows of the tokenized training questions.
X_train_tokenized.head()

In [None]:
# Pad/truncate every training sequence to length 200 (must equal the model's
# Input shape), producing one 2-D array per question column.
question1_array_train, question2_array_train = (
    sequence.pad_sequences(X_train_tokenized[col], maxlen=200)
    for col in ('question1', 'question2')
)

question1_array_train

In [None]:
# Shapes of the padded training arrays, question1 then question2.
print(question1_array_train.shape, question2_array_train.shape, sep='\n')

In [None]:
# Same preprocessing as the training split, applied to the test questions:
# text -> token-id lists -> arrays padded/truncated to length 200.
X_test_tokenized = pd.DataFrame(
    {
        col: tokenizer.texts_to_sequences(X_test[col])
        for col in ('question1', 'question2')
    }
)

question1_array_test, question2_array_test = (
    sequence.pad_sequences(X_test_tokenized[col], maxlen=200)
    for col in ('question1', 'question2')
)

In [None]:
# Shapes of the padded test arrays, question1 then question2.
print(question1_array_test.shape, question2_array_test.shape, sep='\n')

In [None]:
# Two-tower duplicate-question classifier.
#
# Each question's padded token sequence goes through its own
# Embedding -> LSTM -> Dropout tower (the towers are separate layers, so no
# weights are shared). The two tower outputs are concatenated and passed to a
# small dense head ending in a single sigmoid unit.
#
# Coupling with upstream cells:
#   * 200  must equal the `maxlen` used in the pad_sequences calls.
#   * 1000 must equal the `num_words` given to the Tokenizer.
#
# `input_length` is deliberately not passed to Embedding: the sequence length
# is already fixed by the Input layer, and the argument is deprecated in
# Keras 3.

# --- question 1 tower ---
question1_inp = Input(shape=(200,), name='q1_token_sequence')
emb_question1 = Embedding(1000, 64, name='q1_embedding')(question1_inp)
lstm_out_qst1 = LSTM(128, name='q1_lstm')(emb_question1)
dropout_qst1 = Dropout(0.25, name='q1_dropout')(lstm_out_qst1)

# --- question 2 tower (same architecture, independent weights) ---
question2_inp = Input(shape=(200,), name='q2_token_sequence')
emb_question2 = Embedding(1000, 64, name='q2_embedding')(question2_inp)
lstm_out_qst2 = LSTM(128, name='q2_lstm')(emb_question2)
dropout_qst2 = Dropout(0.25, name='q2_dropout')(lstm_out_qst2)

# --- merge the two encodings and classify ---
concat = Concatenate(name='concatenation')([dropout_qst1, dropout_qst2])

dense_1 = Dense(32, activation='relu', name='dense_dim_reduction')(concat)

# Single sigmoid unit, paired with the binary cross-entropy loss at compile time.
out = Dense(1, activation='sigmoid', name='dense_classification')(dense_1)

model = Model(inputs=[question1_inp, question2_inp], outputs=[out])

In [None]:
# Draw the layer graph top-to-bottom ('TB') with tensor shapes on each node.
# NOTE(review): plot_model presumably needs pydot/graphviz installed — verify the environment.
plot_model(model, show_shapes=True, rankdir='TB', dpi=300)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Binary cross-entropy is both the optimized loss and a reported metric,
# alongside accuracy; optimizer is Adam with its default settings.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_crossentropy', 'accuracy'],
)

In [None]:
# Train for 10 epochs on the two padded question arrays.
# validation_split=.2 holds out a fraction of the provided samples for
# validation; per the Keras docs this is taken from the end of the arrays,
# before shuffling.
# NOTE(review): no random seed is set anywhere visible, so runs are not reproducible.
history = model.fit(
    x=[question1_array_train, question2_array_train],
    y=y_train.values,
    batch_size=512,
    epochs=10,
    validation_split=.2,
)

In [None]:
# Evaluate on the padded test arrays, using the first column of `private_df` as labels.
# NOTE(review): `private_df` is not defined in any cell visible in this notebook, so this
# raises NameError under Restart & Run All. Presumably it is a frame of held-out test
# labels aligned row-for-row with test_df — load it explicitly in the data-loading
# section and confirm the alignment.
model.evaluate([question1_array_test, question2_array_test], private_df.values[:, 0], batch_size=512)