In [2]:
import numpy as np
import pandas as pd
import string
import spacy
import re

In [2]:
# Load spaCy's small English pipeline; it is used further down for tokenization.
nlp = spacy.load("en_core_web_sm")
# Stop-word collection shipped with the pipeline's language defaults.
stopwords = nlp.Defaults.stop_words

In [13]:
# Read the raw training CSV (path is relative to the notebook's working directory).
df = pd.read_csv("../Dataset/train.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [3]:
from gensim.models import KeyedVectors

# Load pre-trained word vectors stored in the binary word2vec format.
# NOTE(review): the file name suggests the 300-d GoogleNews vectors — confirm.
embedding = KeyedVectors.load_word2vec_format("../Waste/GoogleNews-vectors-negative300.bin", binary=True)  # C bin format

In [6]:
# del embedding

In [7]:
# Ordered literal replacements applied by clean_text (spelling normalisation,
# contractions, brand names, symbols). Kept as a tuple of pairs because order
# matters: "e-mail" has to be rewritten before the bare "-" rule would split it.
# NOTE(review): 'citicise' looks like a typo for 'criticise' — confirm whether
# the key is meant to match a misspelling that actually occurs in the data.
_TEXT_REPLACEMENTS = (
    ('colour', 'color'),
    ('centre', 'center'),
    ('didnt', 'did not'),
    ('doesnt', 'does not'),
    ('isnt', 'is not'),
    ('shouldnt', 'should not'),
    ("He'll", 'He will'),
    ("She'll", 'She will'),
    ("i'm", "I am"),
    ("e-mail", "email"),
    ('favourite', 'favorite'),
    ('travelling', 'traveling'),
    ('counselling', 'counseling'),
    ('theatre', 'theater'),
    ('cancelled', 'canceled'),
    ('labour', 'labor'),
    ('organisation', 'organization'),
    ('wwii', 'world war 2'),
    ('citicise', 'criticize'),
    ('instagram', 'social medium'),
    ('whatsapp', 'social medium'),
    ('snapchat', 'social medium'),
    ("₹", " rupee "),
    ('&', " & "),
    ('/', " "),
    ('-', " "),
)


def clean_text(data):
    """Normalise one question string before tokenization.

    Steps, in order:
      1. ``data`` is coerced to ``str``.
      2. Every ASCII digit is replaced by ``'#'`` (one ``'#'`` per digit).
      3. Each pair in ``_TEXT_REPLACEMENTS`` is applied as a literal,
         case-sensitive substring replacement.

    Because digit masking runs first, the ``2`` introduced by the
    ``'wwii' -> 'world war 2'`` rule is left as a digit.

    Parameters
    ----------
    data : Any
        Raw text; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    # A single pass is enough: masking runs of 8, 7, ..., 1 digits with the same
    # number of '#' characters is the same as masking every digit individually.
    data = re.sub(r"[0-9]", "#", str(data))
    # str.replace treats the keys as plain text, so a future key containing a
    # regex metacharacter (e.g. '$' or '+') cannot be misread as a pattern.
    for old, new in _TEXT_REPLACEMENTS:
        data = data.replace(old, new)
    return data

In [8]:
# Clean both question columns; missing questions are treated as empty strings.
for question_col in ("question1", "question2"):
    df[question_col] = df[question_col].fillna("").apply(clean_text)

In [9]:
# Persist the cleaned frame. index=False is not passed, so the DataFrame index
# is written as an extra leading column.
df.to_csv("../Dataset/preprocessed_df.csv")

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10,000 rows as the test set, stratified on the label so both splits
# keep the same duplicate ratio. random_state is pinned (same seed as the
# StratifiedShuffleSplit used later in this notebook) so that re-running this
# cell reproduces the same split instead of silently producing a new one.
train_df, test_df = train_test_split(
    df,
    test_size=10000,
    shuffle=True,
    stratify=df['is_duplicate'],
    random_state=42,
)

In [9]:
# Left commented out so previously saved splits are not overwritten by accident;
# uncomment manually to regenerate them. Paths match the ones read back below.

# train_df.to_csv("../Dataset/train_df.csv")
# test_df.to_csv("../Dataset/test_df.csv")

In [4]:
# Reload the saved splits. read_csv parses empty fields as NaN, which undoes the
# fillna("") applied before cleaning, so restore empty strings in the text
# columns; otherwise the tokenizer further down receives float NaN instead of str.
train_df = pd.read_csv("../Dataset/train_df.csv")
test_df = pd.read_csv("../Dataset/test_df.csv")
for split_df in (train_df, test_df):
    for question_col in ("question1", "question2"):
        split_df[question_col] = split_df[question_col].fillna("")

In [4]:
# Pull the question texts of the train split out as NumPy arrays.
q1_train, q2_train = (train_df[column].values for column in ('question1', 'question2'))

In [5]:
# Save the raw train question arrays, one file per question column.
for target_path, payload in (
    ('../Dataset/q1_train.npy', q1_train),
    ('../Dataset/q2_train.npy', q2_train),
):
    np.save(target_path, payload)

In [6]:
# Pull the question texts of the test split out as NumPy arrays.
q1_test, q2_test = (test_df[column].values for column in ('question1', 'question2'))

In [7]:
# Save the raw test question arrays, one file per question column.
for target_path, payload in (
    ('../Dataset/q1_test.npy', q1_test),
    ('../Dataset/q2_test.npy', q2_test),
):
    np.save(target_path, payload)

In [None]:
# Label arrays (is_duplicate) for the train and test splits.
y_train, y_test = (split['is_duplicate'].values for split in (train_df, test_df))

In [None]:
# Save the label arrays for both splits.
for target_path, payload in (
    ('../Dataset/y_train.npy', y_train),
    ('../Dataset/y_test.npy', y_test),
):
    np.save(target_path, payload)

In [None]:
# Allocate the output arrays for the tokenized train questions. empty_like copies
# the shape and dtype of the question arrays; the slots are filled by the
# tokenization loop that follows.
q1_train_npy = np.empty_like(q1_train)
q2_train_npy = np.empty_like(q2_train)

In [56]:
# Tokenizing the train set and creating the vocabulary
from collections import defaultdict

# Index 0 stands for out-of-vocabulary words: the defaultdict returns 0 for any
# key that was never assigned.
vocab = defaultdict(lambda: 0)
# Index 1 is reserved for '<PAD>', used for padding the sequences
vocab['<PAD>'] = 1
# vocab_idx is the inverse mapping (index -> word): vocab_idx[vocab[word]] == word.
# Slot 0 holds a placeholder for the out-of-vocabulary index.
vocab_idx = [0, '<PAD>']
# These punctuations are not in word_embeddings. So we will not keep them in our train sequences
punctuation = {'?', '!', '.', ',', '"', "'", '(', ')', '-', '/', ':', ';', '<', '[', '\\', ']', '{', '|', '}', '“', '”', '’', ''}
for i in range(len(q1_train)):
    # Only token texts are needed, so run the tokenizer alone (make_doc) instead
    # of the full pipeline; the later pipeline components are not needed to
    # obtain token.text.
    q1_train_npy[i] = [token.text for token in nlp.make_doc(q1_train[i]) if token.text not in punctuation]
    q2_train_npy[i] = [token.text for token in nlp.make_doc(q2_train[i]) if token.text not in punctuation]
    for word in q1_train_npy[i] + q2_train_npy[i]:
        # Stop words without a pre-trained vector are left out of the vocabulary,
        # so they later encode as 0. `word in embedding` works on both gensim 3.x
        # and 4.x, whereas the `.vocab` attribute was removed in gensim 4.
        if word in stopwords and word not in embedding:
            continue
        if word not in vocab:
            # The next free index is the current length of vocab_idx; deriving it
            # from vocab_idx keeps the two structures in lock-step even if the
            # defaultdict has grown through lookups of missing keys.
            vocab[word] = len(vocab_idx)
            vocab_idx.append(word)

In [57]:
# Save the tokenized train questions, one file per question column.
for target_path, payload in (
    ('../Dataset/q1_train_tokenized.npy', q1_train_npy),
    ('../Dataset/q2_train_tokenized.npy', q2_train_npy),
):
    np.save(target_path, payload)

In [77]:
# Allocate the output arrays for the tokenized test questions. empty_like copies
# the shape and dtype of the question arrays; the slots are filled by the
# tokenization loop that follows.
q1_test_npy = np.empty_like(q1_test)
q2_test_npy = np.empty_like(q2_test)

In [78]:
# These punctuations are not in word_embeddings. So we will not keep them in our test sequences
punctuation = {'?', '!', '.', ',', '"', "'", '(', ')', '-', '/', ':', ';', '<', '[', '\\', ']', '{', '|', '}', '“', '”', '’', ''}
for i in range(len(q1_test)):
    # Only token texts are needed, so run the tokenizer alone (make_doc) instead
    # of the full pipeline; the later pipeline components are not needed to
    # obtain token.text.
    q1_test_npy[i] = [token.text for token in nlp.make_doc(q1_test[i]) if token.text not in punctuation]
    q2_test_npy[i] = [token.text for token in nlp.make_doc(q2_test[i]) if token.text not in punctuation]

In [79]:
# Save the tokenized test questions, one file per question column.
for target_path, payload in (
    ('../Dataset/q1_test_tokenized.npy', q1_test_npy),
    ('../Dataset/q2_test_tokenized.npy', q2_test_npy),
):
    np.save(target_path, payload)

In [63]:
# Saving vocab_idx (the index -> word list) for future use.
# NOTE(review): vocab_idx mixes the int 0 with strings, so NumPy presumably
# stores it as a string array with '0' in slot 0 — confirm.
np.save('../Dataset/vocab_idx.npy', vocab_idx)

In [70]:
# Flatten the vocab mapping into a list of (word, index) tuples,
# e.g. vocab['Hello'] = 1 gives vocab_list = [('Hello', 1)].
vocab_list = list(vocab.items())

In [74]:
# Saving the vocab_list built above (word/index pairs) for future use.
np.save("../Dataset/vocab_list.npy", vocab_list)

In [2]:
# Reloading vocab_idx and rebuilding the word -> index lookup from the loaded file.
# allow_pickle takes a boolean; the string 'True' only worked because any
# non-empty string is truthy. Pickle-enabled loading can execute code embedded
# in the file, so only load files produced by this notebook.
vocab_idx = np.load("../Dataset/vocab_idx.npy", allow_pickle=True)

# Recreating vocab defaultdict from vocab_idx file
from collections import defaultdict
# Missing words map to 0, the out-of-vocabulary index.
vocab = defaultdict(lambda: 0)
# Slot 0 of vocab_idx is the out-of-vocabulary placeholder, so words start at
# position 1 and each word's index is its position in vocab_idx.
for idx, word in enumerate(vocab_idx[1:], start=1):
    vocab[word] = idx

In [58]:
# Encoding the tokenized train sequences
# vocab.get(word, 0) maps unknown words to 0 without inserting them: indexing the
# defaultdict directly would add every out-of-vocabulary token to vocab as a new
# key with value 0 and inflate len(vocab).
# The token lists are overwritten in place with index lists, so this cell must
# run exactly once after tokenization.
for i in range(len(q1_train_npy)):
    q1_train_npy[i] = [vocab.get(word, 0) for word in q1_train_npy[i]]
    q2_train_npy[i] = [vocab.get(word, 0) for word in q2_train_npy[i]]

In [59]:
# Save the index-encoded train questions, one file per question column.
for target_path, payload in (
    ('../Dataset/q1_train_encoded.npy', q1_train_npy),
    ('../Dataset/q2_train_encoded.npy', q2_train_npy),
):
    np.save(target_path, payload)

In [80]:
# Encoding the tokenized test sequences
# vocab.get(word, 0) maps unknown words to 0 without inserting them: indexing the
# defaultdict directly would add every out-of-vocabulary token to vocab as a new
# key with value 0 and inflate len(vocab).
# The token lists are overwritten in place with index lists, so this cell must
# run exactly once after tokenization.
for i in range(len(q1_test_npy)):
    q1_test_npy[i] = [vocab.get(word, 0) for word in q1_test_npy[i]]
    q2_test_npy[i] = [vocab.get(word, 0) for word in q2_test_npy[i]]

In [81]:
# np.save('../Dataset/q1_test_encoded.npy', q1_test_npy)
# np.save('../Dataset/q2_test_encoded.npy', q2_test_npy)

In [8]:
# Can't directly save the vocab file of type defaultdict because of lambda
# PicklingError: Can't pickle <function <lambda> at 0x00000223530A1CA0>: attribute lookup <lambda> on __main__ failed


# np.save('Dataset/vocab.npy', vocab)

In [4]:
# getting embedding_matrix from the loaded pre-trained word embedding
embed_dim = 300
# Start from zeros so every row without a pre-trained vector is a zero vector:
# row 0 (words not in vocab), row 1 (<PAD>) and any vocabulary word that is
# missing from the embedding. np.empty would leave those last rows filled with
# uninitialised memory, because the loop below only writes rows for known words.
embedding_matrix = np.zeros(shape=(len(vocab_idx), embed_dim))
for word in vocab_idx[2:]:
    # `word in embedding` works on both gensim 3.x and 4.x; the `.vocab`
    # attribute was removed in gensim 4.
    if word in embedding:
        embedding_matrix[vocab[word]] = embedding[word]

In [6]:
# Sanity check: every row of the embedding matrix has 300 components.
assert all(len(row_vector) == 300 for row_vector in embedding_matrix)

In [5]:
# Persist the embedding matrix so later sessions can load it from disk.
np.save("../Dataset/embedding_matrix.npy", embedding_matrix)

In [10]:
# Fixed sequence length used when padding the encoded questions.
# NOTE(review): the original note calls this "the length of the largest question
# in our train set" but also says max_seq_len=60 covers "more than 99.985%" of
# questions, so 60 appears to be a cut-off slightly below the true maximum — confirm.
max_seq_len = 60

In [67]:
from keras.preprocessing.sequence import pad_sequences

# Pad both encoded train question arrays to max_seq_len using index 1 (<PAD>).
# The names are rebound from the encoded sequences to the padded result.
q1_train_npy, q2_train_npy = (
    pad_sequences(encoded_questions, maxlen=max_seq_len, value=1)
    for encoded_questions in (q1_train_npy, q2_train_npy)
)

In [121]:
# Save the padded train questions, one file per question column.
for target_path, payload in (
    ('../Dataset/q1_train_padded.npy', q1_train_npy),
    ('../Dataset/q2_train_padded.npy', q2_train_npy),
):
    np.save(target_path, payload)

In [82]:
# Pad both encoded test question arrays to max_seq_len using index 1 (<PAD>).
# The names are rebound from the encoded sequences to the padded result.
q1_test_npy, q2_test_npy = (
    pad_sequences(encoded_questions, maxlen=max_seq_len, value=1)
    for encoded_questions in (q1_test_npy, q2_test_npy)
)

In [83]:
# Save the padded test questions, one file per question column.
for target_path, payload in (
    ('../Dataset/q1_test_padded.npy', q1_test_npy),
    ('../Dataset/q2_test_padded.npy', q2_test_npy),
):
    np.save(target_path, payload)

In [2]:
# Load the padded train question arrays saved earlier.
# allow_pickle takes a boolean; the string 'True' only worked because any
# non-empty string is truthy. Pickle-enabled loading can execute code embedded
# in the file, so only load files produced by this notebook.
q1_train_padded = np.load("../Dataset/q1_train_padded.npy", allow_pickle=True)
q2_train_padded = np.load("../Dataset/q2_train_padded.npy", allow_pickle=True)

In [3]:
# Load the manual feature array for the train split.
# allow_pickle takes a boolean; the string 'True' only worked because any
# non-empty string is truthy. Only load trusted files with pickle enabled.
train_manual_features_npy = np.load("../manual_features/train_manual_features_npy.npy", allow_pickle=True)

In [4]:
# Load the train labels saved earlier.
# allow_pickle takes a boolean; the string 'True' only worked because any
# non-empty string is truthy. Only load trusted files with pickle enabled.
y_train = np.load("../Dataset/y_train.npy", allow_pickle=True)

In [5]:
# Load the index -> word list saved earlier.
# allow_pickle takes a boolean; the string 'True' only worked because any
# non-empty string is truthy. Only load trusted files with pickle enabled.
vocab_idx = np.load("../Dataset/vocab_idx.npy", allow_pickle=True)

In [6]:
# Rebuild the word -> index lookup from vocab_idx. Slot 0 of vocab_idx is skipped,
# so each word's index equals its position in vocab_idx; unknown words fall back to 0.
from collections import defaultdict
vocab = defaultdict(
    lambda:0,
    {token: position for position, token in enumerate(vocab_idx[1:], start=1)},
)

In [7]:
# Load the embedding matrix saved earlier.
# allow_pickle takes a boolean; the string 'True' only worked because any
# non-empty string is truthy. Only load trusted files with pickle enabled.
embedding_matrix = np.load("../Dataset/embedding_matrix.npy", allow_pickle=True)

In [8]:
from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified shuffle split that holds out 40,000 samples; the seed is
# fixed so the split is the same on every run.
sss = StratifiedShuffleSplit(n_splits=1, test_size=40000, random_state=42)

In [9]:
# sss is configured with n_splits=1, so its generator yields exactly one
# (train, validation) pair of index arrays; take it directly.
train_idx, valid_idx = next(sss.split(q1_train_padded, y_train))

# Slice every aligned array with the same indices so rows stay matched.
q1_train_final, q1_valid_final = q1_train_padded[train_idx], q1_train_padded[valid_idx]
q2_train_final, q2_valid_final = q2_train_padded[train_idx], q2_train_padded[valid_idx]
train_manual_features_final, valid_manual_features_final = (
    train_manual_features_npy[train_idx],
    train_manual_features_npy[valid_idx],
)
y_train_final, y_valid_final = y_train[train_idx], y_train[valid_idx]

In [23]:
# Save the final train/validation arrays under ../Dataset/.
for target_path, payload in (
    ("../Dataset/q1_train_final.npy", q1_train_final),
    ("../Dataset/q2_train_final.npy", q2_train_final),
    ("../Dataset/q1_valid_final.npy", q1_valid_final),
    ("../Dataset/q2_valid_final.npy", q2_valid_final),
    ("../Dataset/train_manual_features_final.npy", train_manual_features_final),
    ("../Dataset/valid_manual_features_final.npy", valid_manual_features_final),
    ("../Dataset/y_train_final.npy", y_train_final),
    ("../Dataset/y_valid_final.npy", y_valid_final),
):
    np.save(target_path, payload)

In [11]:
# Save a second copy of the final train/validation arrays under ../Dataset/MyVersion/.
for target_path, payload in (
    ("../Dataset/MyVersion/q1_train_final.npy", q1_train_final),
    ("../Dataset/MyVersion/q2_train_final.npy", q2_train_final),
    ("../Dataset/MyVersion/q1_valid_final.npy", q1_valid_final),
    ("../Dataset/MyVersion/q2_valid_final.npy", q2_valid_final),
    ("../Dataset/MyVersion/train_manual_features_final.npy", train_manual_features_final),
    ("../Dataset/MyVersion/valid_manual_features_final.npy", valid_manual_features_final),
    ("../Dataset/MyVersion/y_train_final.npy", y_train_final),
    ("../Dataset/MyVersion/y_valid_final.npy", y_valid_final),
):
    np.save(target_path, payload)

In [24]:
# Reload the saved final train array for question 1 (pickle loading enabled).
q1_train_final = np.load("../Dataset/q1_train_final.npy", allow_pickle=True)