In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
sns.set(color_codes=True)

In [3]:
import tensorflow as tf
import keras
from keras import backend as K
from tensorflow.keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers.core import Lambda
from keras.layers import Input, Concatenate, Flatten, Dense, Embedding, LSTM ,  Multiply, Dropout, Subtract, Add
import torch
!pip install -U torchtext==0.8.0
from torchtext.data import Field
from torchtext.vocab import GloVe

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
import ast

In [12]:
df = pd.read_csv('/content/drive/MyDrive/datasets/train.csv')

# PreProcessing

In [13]:
df.dropna(inplace=True)

In [14]:
def remove_punc_word(word):
  # initializing punctuations string  
  punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~'''
  # Removing punctuations in string 
  # Using loop + punctuation string 
  for ele in word:  
    if ele in punc:  
        word = word.replace(ele, " ")  
  return word

def remove_punc_list(word_list):
  for i in range(len(word_list)):
    word_list[i] = remove_punc_word(word_list[i])
  return word_list

def clear_string(text):
  # Clean the text
  text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
  #text = re.sub(r"what's", "what is ", text)
  text = re.sub(r"\'s", "is", text)
  text = re.sub(r"\'ve", "have", text)
  text = re.sub(r"can't", "cannot", text)
  text = re.sub(r"n't", "not", text)
  text = re.sub(r"\'m", "am", text)
  text = re.sub(r"\'re", "are", text)
  text = re.sub(r"\'d", "would", text)
  text = re.sub(r"\'ll", "will", text)
  #text = re.sub(r",", " ", text)
  #text = re.sub(r"\.", " ", text)
  #text = re.sub(r"!", " ! ", text)
  #text = re.sub(r"\/", " ", text)
  #text = re.sub(r"\^", " ^ ", text)
  #text = re.sub(r"\+", " + ", text)
  #text = re.sub(r"\-", " - ", text)
  #text = re.sub(r"\=", " = ", text)
  #text = re.sub(r"'", " ", text)
  text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
  #text = re.sub(r":", " : ", text)
  text = re.sub(r" e g ", " eg ", text)
  text = re.sub(r" b g ", " bg ", text)
  text = re.sub(r" u s ", " american ", text)
  text = re.sub(r"\0s", "0", text)
  text = re.sub(r" 9 11 ", "911", text)
  text = re.sub(r"e - mail", "email", text)
  text = re.sub(r"j k", "jk", text)
  text = re.sub(r"\s{2,}", " ", text)
  return text

In [15]:
def preprocess_line(line,tokenize=True,punc_remove=True,stem=True,tokenizer=word_tokenize,stops_remove=True,punc_remover= lambda x: [word for word in x if word.isalpha()],stemmer = SnowballStemmer('english'),stop_words=stopwords.words('english')):
  processed_text = line 
  # (1) Tokenizing
  if tokenize:
    processed_text = tokenizer(line)
  # (1.1) Cleaning String
  processed_text =  [clear_string(word) for word in processed_text]
  # (2) Stemming
  if stem:
    processed_text = [stemmer.stem(word) for word in processed_text]
  # (3) Stop words
  if stops_remove:
    processed_text = [word for word in processed_text if word not in stop_words]
  # (4) Removing Punc (Default: Remove everything except pure text)
  if punc_remove:
    processed_text = punc_remover(processed_text)
  # (5) Spaces Removal
  processed_text = list(filter(lambda x: x and x.strip(),processed_text))
  return processed_text

In [16]:
df['question1'] = df['question1'].apply(lambda x: preprocess_line(x,punc_remover=remove_punc_list))

In [17]:
df['question2'] = df['question2'].apply(lambda x: preprocess_line(x,punc_remover=remove_punc_list)) 

In [18]:
df['question1_len'] = df['question1'].apply(lambda x: len(x))
df['question2_len'] = df['question2'].apply(lambda x: len(x))

In [19]:
text_field = Field()
embedder = GloVe(name='6B', dim=300)
text_field.build_vocab(
    pd.concat([df['question1'],df['question2']],ignore_index=True), 
    vectors=embedder
)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                           
100%|█████████▉| 399999/400000 [00:33<00:00, 12021.53it/s]


In [20]:
vocab = text_field.vocab
len(vocab)

88350

In [21]:
df['question1'] = df['question1'].apply(lambda x: [vocab[word] for word in x])
df['question2'] = df['question2'].apply(lambda x: [vocab[word] for word in x])

# Padding and embedding


In [22]:
MAX_LENGTH = 30
EMBEDDING_DIM = 300

In [23]:
q1_seq = pad_sequences(df['question1'], maxlen=MAX_LENGTH, padding='post')
q2_seq = pad_sequences(df['question2'], maxlen=MAX_LENGTH, padding='post')

In [24]:
def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

In [25]:
from sklearn.metrics import roc_auc_score

def auroc(y_true, y_pred):
    return tf.py_function(roc_auc_score, (y_true, y_pred), tf.double)

# Model

In [26]:
input_1 = Input(shape=(q1_seq.shape[1],))
input_2 = Input(shape=(q2_seq.shape[1],))


common_embed = Embedding(name="synopsis_embedd",input_dim =len(vocab), 
                       output_dim=EMBEDDING_DIM,weights=[vocab.vectors], 
                       input_length=q1_seq.shape[1],trainable=False) 
lstm_1 = common_embed(input_1)
lstm_2 = common_embed(input_2)


common_lstm = LSTM(64,return_sequences=True, activation="relu")
vector_1 = common_lstm(lstm_1)
vector_1 = Flatten()(vector_1)

vector_2 = common_lstm(lstm_2)
vector_2 = Flatten()(vector_2)

x3 = Subtract()([vector_1, vector_2])
x3 = Multiply()([x3, x3])

x1_ = Multiply()([vector_1, vector_1])
x2_ = Multiply()([vector_2, vector_2])
x4 = Subtract()([x1_, x2_])
    
    #https://stackoverflow.com/a/51003359/10650182
x5 = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([vector_1, vector_2])
    
conc = Concatenate(axis=-1)([x5,x4, x3])

x = Dense(100, activation="relu", name='conc_layer')(conc)
x = Dropout(0.01)(x)
out = Dense(1, activation="sigmoid", name = 'out')(x)

model = Model([input_1, input_2], out)

model.compile(loss="binary_crossentropy", metrics=['acc',auroc], optimizer=Adam(0.00001))



In [27]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 30)]         0           []                               
                                                                                                  
 synopsis_embedd (Embedding)    (None, 30, 300)      26505000    ['input_1[0][0]',                
                                                                  'input_2[0][0]']                
                                                                                                  
 lstm (LSTM)                    (None, 30, 64)       93440       ['synopsis_embedd[0][0]',    

In [28]:
number_of_pairs = len(q1_seq)

q1_seq_val = q1_seq[int(0.8*number_of_pairs):number_of_pairs]
q2_seq_val = q2_seq[int(0.8*number_of_pairs):number_of_pairs]
y_val= df['is_duplicate'].iloc[int(0.8*number_of_pairs):number_of_pairs]

q1_seq = q1_seq[:int(0.8*number_of_pairs)]
q2_seq = q2_seq[:int(0.8*number_of_pairs)]
y_train = df['is_duplicate'].iloc[:int(0.8*number_of_pairs)]

In [None]:
model.fit([q1_seq,q2_seq],y_train.values.reshape(-1,1), epochs = 5,batch_size=64, validation_data=( [q1_seq_val,q2_seq_val],y_val.values.reshape(-1,1) ) )

Epoch 1/5
Epoch 2/5
Epoch 3/5