In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.layers as layers
from sklearn.model_selection import train_test_split
from time import time
from os.path import abspath, join, pardir, dirname, lexists
from src.data_cleaning import *
import tensorflow as tf

In [None]:
# If TensorFlow detects at least one GPU, make only the first one visible and
# turn on memory growth for it.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as e:
        # Report the problem and carry on instead of aborting the notebook.
        print(e)

In [None]:
# Text-preprocessing settings shared by the tokenizer, the padding helper and
# the model definition further down in this notebook.
VOCAB_SIZE = 20000  # passed as Tokenizer(num_words=...) and as the Embedding input_dim
OOV_TOKEN = '<OOV>'  # passed as Tokenizer(oov_token=...)
TRUNCATING = 'pre'  # pad_sequences(truncating=...): over-long sequences lose their leading tokens
PADDING = 'pre'  # pad_sequences(padding=...): short sequences are padded at the front
MAX_SENTENCE_LENGTH = 80  # pad_sequences(maxlen=...): fixed length of every encoded question

In [None]:
# Load the training pairs; the 'id' column is used as the DataFrame index.
train_data = pd.read_csv('input/train.csv', index_col='id')

In [None]:
# Cast both question columns to str so the string operations used below
# (.lower(), .split(), len()) work on every row, including rows that were
# read as missing values.
train_data.question1 = train_data.question1.astype(str)
train_data.question2 = train_data.question2.astype(str)

In [None]:
def get_overlapping_ratio(row):
    """Return the word-overlap ratio of ``row.question1`` and ``row.question2``.

    Both questions are lower-cased and split on whitespace into sets of unique
    words.  The ratio is ``2 * |shared words| / (|q1 words| + |q2 words|)``,
    rounded to 4 decimal places, so it lies between 0.0 and 1.0.

    Splitting uses ``str.split()`` rather than ``split(' ')`` so that leading,
    trailing or repeated spaces do not create empty-string "words" that would
    be counted as shared vocabulary.

    Args:
        row: Any object exposing string attributes ``question1`` and
            ``question2`` (for example a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``).

    Returns:
        float: The rounded overlap ratio; 0.0 when neither question contains
        any word.
    """
    q1words = set(row.question1.lower().split())
    q2words = set(row.question2.lower().split())
    total_words = len(q1words) + len(q2words)
    if total_words == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    overlapping_words = q1words & q2words
    return round(len(overlapping_words) * 2 / total_words, 4)

In [None]:
# Word-overlap ratio for every question pair, computed row by row.
overlaps = train_data.apply(get_overlapping_ratio, axis=1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Overlaid histograms of the overlap ratio for non-duplicate (is_duplicate == 0)
# and duplicate (is_duplicate == 1) question pairs.
plt.figure(figsize=(15, 10))
different_overlaps = overlaps[train_data.is_duplicate == 0]
duplicate_overlaps = overlaps[train_data.is_duplicate == 1]
plt.hist(different_overlaps, bins=40, label='different')
plt.hist(duplicate_overlaps, bins=40, alpha=0.7, label='duplicate')
plt.legend()
plt.xlabel('overlap ratio')
plt.ylabel('number of instances')

In [None]:
# Relative character-length gap per pair:
#   |len(question1) - len(question2)| / max(len(question1), len(question2))
# NOTE(review): this divides by zero if both questions are empty strings.
diffs = train_data.apply(lambda x: abs(len(x.question1) - len(x.question2))/max(len(x.question1),len(x.question2)),axis=1)

In [None]:
# Draw 20000 random rows from each is_duplicate class to get a class-balanced
# subset.  No random_state is passed, so the sample changes on every run.
balanced_train = train_data.groupby('is_duplicate').apply(lambda x: x.sample(20000))

In [None]:
# Overlaid histograms of the relative length gap for the balanced sample.
plt.figure(figsize=(15,10))

# Look up the precomputed gaps by the row labels of each sampled class
# (group 0 = not duplicate, group 1 = duplicate).
d1 = diffs[balanced_train.loc[0].index]
d2 = diffs[balanced_train.loc[1].index]
plt.hist(d1, bins=50, label='different')
plt.hist(d2, bins=50, alpha=0.7, label='duplicate')
plt.yscale('linear')
plt.legend()

In [None]:
def get_question_count(sentence):
    """Placeholder that is not implemented yet; it ignores ``sentence`` and returns None."""
    pass

In [None]:
from nltk.tokenize import sent_tokenize

# Run NLTK's sent_tokenize on every question of each column; the results are
# measured with len() in the next cell.
q1_sentences = train_data['question1'].apply(sent_tokenize)
q2_sentences = train_data['question2'].apply(sent_tokenize)

In [None]:
# (number of question1 entries, mean length of their sent_tokenize results)
len(q1_sentences), (q1_sentences.apply(len)).mean()

In [None]:
# (mean, standard deviation) of the relative length gap for non-duplicate pairs.
diffs[train_data.is_duplicate == 0].mean(), diffs[train_data.is_duplicate == 0].std()

In [None]:
# (mean, standard deviation) of the relative length gap for duplicate pairs.
diffs[train_data.is_duplicate == 1].mean(),diffs[train_data.is_duplicate == 1].std()

In [None]:
# Add one hand-written duplicate pair to the training data.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row DataFrame and joined with pd.concat instead.  ignore_index=True
# renumbers the rows, which discards the original 'id' index labels.
# Note: every execution of this cell adds the row again.
extra_pair = pd.DataFrame([{'qid1': 303951, 'qid2': 174363, 'question1': 'How can I create an Android app?',
                            'question2': 'How can I develop android app?', 'is_duplicate': 1}])
train_data = pd.concat([train_data, extra_pair], ignore_index=True)
# Drop every row that still contains a missing value.
train_data = train_data.dropna(axis=0)

In [None]:
# English stop-word list.  `stopwords` is not imported explicitly in this
# notebook; presumably it is provided by the star import from
# src.data_cleaning (TODO confirm).
eng_stopwords = stopwords.words('english')
# Frozen-set copy used for membership tests: remove_stopwords runs once per
# word of every question, and a set lookup avoids scanning the list each time.
_eng_stopword_set = frozenset(eng_stopwords)


def remove_stopwords(sentence):
    """Return ``sentence`` with every word found in ``eng_stopwords`` removed.

    The sentence is split on single spaces and the surviving words are joined
    back with single spaces.  The comparison is an exact, case-sensitive
    string match against the stop-word entries.

    Args:
        sentence: The text to filter (a ``str``).

    Returns:
        str: The filtered sentence.
    """
    return ' '.join(word for word in sentence.split(' ') if word not in _eng_stopword_set)

In [None]:
# Filter stop words out of both question columns, passing the helper directly
# to apply().
train_data.question1 = train_data.question1.apply(remove_stopwords)
train_data.question2 = train_data.question2.apply(remove_stopwords)

In [None]:
# Split the question pairs and their labels into training and validation
# parts.  Neither test_size nor random_state is given, so scikit-learn's
# defaults apply and the split changes from run to run.
X_train, X_valid, y_train, y_valid = train_test_split(train_data[['question1', 'question2']],
                                                      train_data['is_duplicate'])

In [None]:
# Report the (rows, columns) shape of the training and validation frames.
print("training samples: {}, validation samples: {}".format(X_train.shape, X_valid.shape))


In [None]:
# Build the vocabulary from the training questions only (both columns stacked
# into one Series), so validation and test text never influence it.
# Series.append was removed in pandas 2.0; pd.concat is the equivalent call.
tokenizer = Tokenizer(oov_token=OOV_TOKEN, num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(pd.concat([X_train.question1, X_train.question2]))

In [None]:
def convert_to_seqs(x):
    """Encode texts with the fitted ``tokenizer`` and pad them to a fixed length.

    Args:
        x: The texts to encode, as accepted by ``tokenizer.texts_to_sequences``.

    Returns:
        The output of ``pad_sequences`` for the encoded texts, using
        ``MAX_SENTENCE_LENGTH``, ``TRUNCATING`` and ``PADDING`` as settings.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(x),
        maxlen=MAX_SENTENCE_LENGTH,
        truncating=TRUNCATING,
        padding=PADDING,
    )

In [None]:
# Encode and pad each question column of the training and validation splits.
# A copy of each column is handed to the helper.
train_q1_sents = convert_to_seqs(X_train.question1.copy())
train_q2_sents = convert_to_seqs(X_train.question2.copy())
valid_q1_sents = convert_to_seqs(X_valid.question1.copy())
valid_q2_sents = convert_to_seqs(X_valid.question2.copy())

In [None]:
# Report the shapes of the encoded question1 arrays for both splits.
print("training input: {}, validation input: {}".format(train_q1_sents.shape, valid_q1_sents.shape))

In [None]:
# Build, train and save the model only when no saved model is present yet.
if not lexists("model/saved_model.pb"):

    # Each sample is question1's padded tokens followed by question2's, i.e.
    # one sequence of 2 * MAX_SENTENCE_LENGTH ids.  This is the same layout
    # the prediction cells feed to the model via np.concatenate(..., axis=1),
    # so the input layer and the training data are sized to match it.
    combined_length = 2 * MAX_SENTENCE_LENGTH
    input_array = np.concatenate([train_q1_sents, train_q2_sents], axis=1)
    valid_input_array = np.concatenate([valid_q1_sents, valid_q2_sents], axis=1)
    print(input_array.shape)

    q1_input = layers.Input((combined_length,), sparse=False)

    q1_embedding = layers.Embedding(input_dim=VOCAB_SIZE, output_dim=200, input_length=combined_length)(
                                    q1_input)
    q1_recurrent = layers.Bidirectional(layer=layers.GRU(128))(q1_embedding)

    q1_dense = layers.Dense(64, activation='relu')(q1_recurrent)

    # Single sigmoid unit: the predicted probability that the pair is a duplicate.
    output_layer = layers.Dense(1, activation='sigmoid')(q1_dense)

    complete_model = tf.keras.Model(inputs=q1_input, outputs=[output_layer])
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print().
    complete_model.summary()
    complete_model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    # The model has exactly one input, so it is trained on the single
    # concatenated array rather than on a list of two arrays.
    complete_model.fit(input_array, y_train, batch_size=256, epochs=5,
                       validation_data=(valid_input_array, y_valid))

    complete_model.save("model")

In [None]:
# Load the model stored under "model" (the path the training cell saves to).
model = tf.keras.models.load_model("model")

In [None]:
# Load the test pairs with the default integer index.
test_data = pd.read_csv('input/test.csv')

In [None]:
# Replace missing questions with fixed placeholder sentences so that every
# row has text in both columns.
test_data.question2 = test_data.question2.fillna('How I what can learn android app development?')
test_data.question1 = test_data.question1.fillna('How app development?')
# Show a hand-picked set of rows, presumably the ones that had missing
# values (TODO confirm).
print(test_data.loc[[1046690,1461432,379205,817520,943911,1270024], :])

In [None]:
# Show the test frame's shape and columns, then encode and pad both question
# columns with the same helper used for the training data.
print(test_data.shape, test_data.columns)
test_q1_sents = convert_to_seqs(test_data.question1)
test_q2_sents = convert_to_seqs(test_data.question2)

max number of words in question 1 = 127

max number of words in question 2 = 237

We pad sequences to a fixed length of `MAX_SENTENCE_LENGTH` (80) tokens, truncating longer ones from the beginning and left-padding shorter ones with zeros.

In [None]:
# Predict on the test pairs: each row is question1's encoded tokens followed
# by question2's.
predictions = model.predict(np.concatenate([test_q1_sents, test_q2_sents], axis=1))
# Column 0 of the prediction array becomes the is_duplicate value of each
# test_id; the result is written without the DataFrame index.
submission = pd.DataFrame({'is_duplicate': predictions[:, 0], 'test_id': test_data.test_id})
submission.to_csv('bi_gru_submission.csv', index=False)

In [None]:
# Predict on the first 100 training pairs for a quick manual comparison with
# their labels.
training_predictions = model.predict(np.concatenate([train_q1_sents[:100], train_q2_sents[:100]], axis=1))

In [None]:
# Print position, predicted value and true label for the first 100 training pairs.
for i, (prediction, label) in enumerate(zip(training_predictions, y_train[:100])):
    print(i, prediction[0], label)

In [None]:
# Display the training pair at position 12.
X_train.iloc[12]