In [None]:
import torch
import numpy as np
import pandas as pd
from torch import nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.functional import F
import spacy
import string
import re
import numpy as np
from spacy.symbols import ORTH
from collections import Counter
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import keras


In [None]:
# Hyperparameters read as globals by the data loaders, model and optimizer below.
batch_size, embedding_size, hidden_dim = 100, 50, 100  # mini-batch size, embedding width, LSTM hidden size
epochs, learning_rate = 30, 0.001                      # epochs per training call, Adam learning rate

In [None]:
# Directory that holds the competition CSV files, relative to the notebook's working directory.
PATH = Path('../input/quora-question-pairs/quora-question-pairs/')

In [None]:
# Show what is available in the input directory.
[entry for entry in PATH.iterdir()]

In [None]:
# Locations of the two CSV files (val_path points at test.csv).
train_path = PATH.joinpath('train.csv')
val_path = PATH.joinpath('test.csv')


In [None]:
# Read both CSV files, train first and then test.
train, test = (pd.read_csv(str(csv_file)) for csv_file in (train_path, val_path))

In [None]:
# Down-sample both frames so the notebook runs quickly.
# A fixed random_state makes the subset (and everything computed from it)
# reproducible across kernel restarts; min() avoids a ValueError when a frame
# has fewer rows than requested.
train = train.sample(min(15000, len(train)), random_state=0)
test = test.sample(min(5000, len(test)), random_state=0)

In [None]:
# Replace missing values with empty strings so every question cell is text.
train = train.fillna('')
test = test.fillna('')

## Tokens!

In [None]:
# Matches HTML line-break tags such as <br>, <br/> or < BR / > (case-insensitive).
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return ``x`` with every HTML ``<br>`` tag replaced by a newline."""
    return re_br.sub("\n", x)


# NOTE(review): the 'en' shortcut name is only resolvable on older spaCy
# releases; confirm the installed version, newer ones need a full model name.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize one question with spaCy and return the list of token strings.

    * ``str`` input is tokenized after ``<br>`` tags are replaced by newlines.
    * A ``list`` is assumed to be already tokenized and is returned as a copy,
      so applying this function twice to a column is harmless.
    * Any other value (e.g. NaN) yields ``[]``.

    The previous bare ``except`` returned ``[]`` for *any* failure, which hid
    real errors and silently erased already-tokenized questions.
    """
    if isinstance(x, list):
        return list(x)
    if not isinstance(x, str):
        return []
    return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

In [None]:
# One set of whitespace-separated words per question, for each of the four columns.
w1 = list(train['question1'].apply(lambda x: set(x.split())))
w2 = list(train['question2'].apply(lambda x: set(x.split())))
w3 = list(test['question1'].apply(lambda x: set(x.split())))
w4 = list(test['question2'].apply(lambda x: set(x.split())))

# The vocabulary is the UNION of all per-question word sets. The previous code
# intersected them (words shared by every question, i.e. almost always nothing)
# and then called set.intersection on the resulting strings, which raises TypeError.
total_words = set().union(*w1, *w2, *w3, *w4)

# Print only a small preview; the full set is far too large to dump.
print(sorted(total_words)[:20])
num_words = len(total_words)

In [None]:
# Number of distinct words collected in total_words.
len(total_words)

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Fit a Keras Tokenizer on the collected words.
# NOTE(review): within the visible notebook `t` is only inspected in the next cell.
t = Tokenizer()
t.fit_on_texts(total_words)

In [None]:
# Display the fitted tokenizer's word_counts attribute.
t.word_counts

In [None]:
# Turn each training question into its list of tokens.
for column in ('question1', 'question2'):
    train[column] = train[column].apply(spacy_tok)

In [None]:
# Turn each test question into its list of tokens.
for column in ('question1', 'question2'):
    test[column] = test[column].apply(spacy_tok)

## Make Counter

In [None]:
# Token frequencies over both question columns of train, then of test.
# The visiting order is kept because it determines the vocabulary index order later.
counts = Counter()
for frame in (train, test):
    for column in ('question1', 'question2'):
        for question_words in frame[column]:
            counts.update(question_words)

In [None]:
# Number of distinct tokens counted.
len(counts)

Delete rare words (tokens that occur fewer than 3 times)

In [None]:
# Collect the rare tokens first, then remove them, so the Counter is not
# modified while it is being iterated.
rare_words = [word for word, freq in counts.items() if freq < 3]
for word in rare_words:
    del counts[word]

In [None]:
# NOTE(review): `text` is not defined anywhere in the visible notebook, so on a
# fresh kernel this cell raises NameError. The resulting `tokenizer` is not
# referenced by any later visible cell either; confirm whether this cell can go.
tokenizer = keras.preprocessing.text.one_hot(text, n=len(counts), filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')

In [None]:
# Index 0 is the padding entry "" and index 1 is "UNK"; the surviving tokens
# follow in the Counter's iteration order.
words = ["", "UNK"]
words.extend(counts)
vocab2index = {word: position for position, word in enumerate(words)}

In [None]:


# note that spacy_tok takes a while run it just once
def encode_sentence(word_list, vocab2index=vocab2index, N=embedding_size, padding_start=False):
    """Map a token list to a fixed-length array of vocabulary ids.

    Args:
        word_list: iterable of token strings.
        vocab2index: token -> id mapping; unknown tokens map to the "UNK" id.
            The default is bound once, when this function is defined.
        N: length of the returned array; longer inputs are truncated to N.
        padding_start: where the zero padding goes. ``False`` (default) puts
            the tokens first and pads at the END; ``True`` pads at the START.

    Returns:
        ``(enc, l)``: ``enc`` is an int32 array of length N and ``l`` is the
        number of real (non-padding) tokens stored in it.

    The two branches used to be swapped, so the default left-padded every
    sentence. The model packs each sequence with ``pack_padded_sequence``,
    which reads the FIRST ``l`` positions, so it was fed padding instead of
    the actual words.
    """
    enc = np.zeros(N, dtype=np.int32)
    ids = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in word_list])
    l = min(N, len(ids))
    if padding_start:
        # zeros first, tokens flush with the end
        enc[N - l:] = ids[:l]
    else:
        # tokens first, zeros after them
        enc[:l] = ids[:l]
    return enc, l



In [None]:
# Replace each token list with its (id array, length) pair.
for column in ('question1', 'question2'):
    train[column] = train[column].apply(encode_sentence)

In [None]:
# `val` was never created before this point, so encoding it raised NameError.
# Hold out 20% of the labelled training frame as the validation set instead.
# `train` is already encoded by the previous cell, so `val` needs no separate
# encoding step. The fixed random_state keeps the split reproducible.
# Note: re-running this cell alone splits the already-reduced `train` again.
train, val = train_test_split(train, test_size=0.2, random_state=0)

Number of words in the vocabulary (the number of rows in the embedding table)

In [None]:
# Vocabulary size, counting the "" padding entry and "UNK"; the model uses it
# as the number of rows of its embedding table.
num_words=len(words)
num_words

## Dataset

In [None]:
# Peek at the first rows of the validation frame.
val.head()

In [None]:
class Question_Dataset(Dataset):
    """Dataset of encoded question pairs and their duplicate labels.

    Each cell of ``df['question1']`` / ``df['question2']`` must hold an
    ``(ids, length)`` pair as produced by ``encode_sentence``.

    Args:
        df: frame with ``question1``, ``question2`` and ``is_duplicate`` columns.
        train: accepted for interface compatibility; currently unused.
    """

    def __init__(self, df, train):
        self.y = torch.Tensor(df['is_duplicate'].values)
        self.x1 = df['question1']
        self.x2 = df['question2']

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as a dict with keys x1, x2, s1, s2, y."""
        # Positional access: a DataLoader asks for 0..len-1, which only matches
        # the frame's labels if the index happens to have been reset.
        x1, s1 = self.x1.iloc[idx]
        x2, s2 = self.x2.iloc[idx]
        return {
            "x1": torch.Tensor(x1),
            'x2': torch.Tensor(x2),
            "s1": s1,
            's2': s2,
            'y': self.y[idx],
        }

    def __len__(self):
        return len(self.y)

In [None]:
# Give both frames a fresh 0..n-1 index (the old index becomes a column).
for frame in (train, val):
    frame.reset_index(inplace=True)

In [None]:
# Wrap the training and validation frames as torch datasets.
train_ds, val_ds = (Question_Dataset(frame, train=True) for frame in (train, val))

## Model

In [None]:
class Questionnaire(nn.Module):
    """Sentence encoder: embedding -> dropout -> LSTM; returns the final hidden state.

    Sizes come from the notebook globals ``num_words``, ``embedding_size`` and
    ``hidden_dim``. The module is created on the CPU; call ``.cuda()`` / ``.to()``
    on the whole model to move every sub-module together.
    """

    def __init__(self):
        super(Questionnaire, self).__init__()
        # No .cuda() here: hard-coding the device on one sub-module made the
        # class unusable without a GPU and is redundant with model.cuda().
        self.lstm = nn.LSTM(embedding_size, hidden_dim, batch_first=True)
        self.embedding = nn.Embedding(num_words, embedding_size, padding_idx=0)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x, s):
        """Encode a batch of right-padded id sequences.

        Args:
            x: integer tensor of token ids, one row per sequence (batch, seq_len).
            s: tensor with the true length of each row (batch,).

        Returns:
            Tensor (batch, hidden_dim) with the last-layer final hidden state
            of every sequence, in the same row order as ``x``.
        """
        s, sort_index = torch.sort(s, 0, descending=True)
        # Packing rejects zero-length sequences; treat an empty question as a
        # single padding token.
        lengths = s.long().clamp(min=1).cpu().numpy().tolist()
        # The rows must be reordered together with their lengths. Previously
        # only the lengths were sorted, so rows were packed with the wrong
        # lengths and outputs were scattered back to the wrong rows.
        x = x[sort_index]
        x = self.embedding(x)
        x = self.dropout(x)
        x_pack = pack_padded_sequence(x.float(), lengths, batch_first=True)
        out_pack, (ht, ct) = self.lstm(x_pack)
        out = ht[-1]
        # Undo the sort: row i of `out` belongs to original position sort_index[i].
        restore = sort_index.unsqueeze(1).expand(-1, out.shape[1])
        return torch.zeros_like(out).scatter_(0, restore, out)
         

In [None]:
def val_metrics(model, valid_dl, eval_metric=F.nll_loss):
    """Evaluate ``model`` on ``valid_dl`` and return the sample-weighted mean loss.

    For every batch both questions are encoded, their similarity is computed
    as ``exp(-L1 distance)`` and turned into two-class log-probabilities
    ``[log(1 - sim), log(sim)]``, which are passed to ``eval_metric`` together
    with the integer labels. One randomly chosen batch has its first example
    printed (decoded through the global ``words`` list) as a sanity check.

    Args:
        model: encoder called as ``model(x, s)``.
        valid_dl: sized iterable of batches; each batch is a dict with keys
            x1, x2, s1, s2, y.
        eval_metric: callable ``(log_probs, targets) -> scalar tensor``.

    Returns:
        The average loss per example (float).
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    total = 0
    sum_loss = 0
    sample_batch = int(np.random.randint(len(valid_dl), size=1)[0])

    with torch.no_grad():  # evaluation only: no gradient bookkeeping needed
        for i, batch in enumerate(valid_dl):
            x1 = batch['x1'].to(device).long()
            x2 = batch['x2'].to(device).long()
            s1 = batch['s1'].to(device).long()
            s2 = batch['s2'].to(device).long()
            y = batch['y'].to(device).long()

            y_hat_1 = model(x1, s1)
            y_hat_2 = model(x2, s2)
            similarity = torch.exp(-torch.abs(y_hat_2 - y_hat_1).sum(-1))

            if i == sample_batch:
                # Reuse this forward pass instead of running the model a second time.
                xt1 = [words[int(x)] for x in x1[0]]
                xt2 = [words[int(x)] for x in x2[0]]
                print('Sentence 1: ', ' '.join(xt1))
                print('Sentence 2:', ' '.join(xt2))
                print('Prediction:', str(float(similarity[0])))
                print('Actual:', str(float(y[0])))

            probs = torch.stack([1 - similarity, similarity], 1).float()
            # nll_loss expects LOG-probabilities; raw probabilities were passed
            # before, so the reported "log loss" was not a log loss at all.
            # The clamp keeps log() finite when a probability is exactly 0.
            log_probs = torch.log(probs.clamp(min=1e-7))

            loss = eval_metric(log_probs, y)
            batch_n = y.shape[0]

            sum_loss += batch_n * (loss.item())
            total += batch_n
    print("Validation Log Loss: ", sum_loss/total)
    return sum_loss/total

In [None]:
def train_routine(model, train_ds, valid_ds, epochs, eval_metric=F.mse_loss):
    """Train ``model`` and return per-epoch training and validation errors.

    Uses the notebook globals ``learning_rate`` (read once, when the Adam
    optimizer is built) and ``batch_size``.

    Args:
        model: encoder called as ``model(x, s)``.
        train_ds: dataset yielding dicts with keys x1, x2, s1, s2, y.
        valid_ds: dataset of the same form, evaluated after every epoch.
        epochs: number of passes over ``train_ds``.
        eval_metric: training loss ``(similarity, targets) -> scalar tensor``.

    Returns:
        ``(train_errors, valid_errors)``: two lists with one float per epoch.
    """
    model.train()
    device = next(model.parameters()).device
    optimizer = torch.optim.Adam(model.parameters(), learning_rate)
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    # Use the dataset that was passed in; the global `val_ds` was read here before.
    val_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)
    valid_errors = []
    train_errors = []
    for epoch in range(epochs):
        model.train()  # val_metrics switches the model to eval mode each epoch

        sum_loss = 0
        total = 0
        for i, batch in enumerate(train_dl):
            optimizer.zero_grad()
            x1 = batch['x1'].to(device).long()
            x2 = batch['x2'].to(device).long()
            s1 = batch['s1'].to(device).long()
            s2 = batch['s2'].to(device).long()
            y = batch['y'].float().to(device)

            y_hat_1 = model(x1, s1)
            y_hat_2 = model(x2, s2)
            # Similarity in (0, 1]: 1 when both encodings are identical.
            similarity = torch.exp(-torch.abs(y_hat_2 - y_hat_1).sum(-1))

            loss = eval_metric(similarity, y)
            loss.backward()
            optimizer.step()

            # loss is a per-batch mean, so weight it by the batch size before
            # dividing by the example count; otherwise the reported value is
            # too small by roughly a factor of batch_size.
            batch_n = y.shape[0]
            total += batch_n
            sum_loss += batch_n * loss.item()
        print("Training Mean Squared error: ", sum_loss/total)
        train_errors.append(sum_loss/total)
        valid_errors.append(val_metrics(model, val_dl))
        print()
    return train_errors, valid_errors

In [None]:
# Instantiate the model and move its parameters to the GPU.
model = Questionnaire().cuda()

In [None]:
# Moves the model to the GPU again; the instance created above was already moved there.
model=model.cuda()

In [None]:
# Lower the global learning rate that train_routine reads when it builds its optimizer.
learning_rate=0.0001

# First training run: keep the per-epoch training and validation errors for plotting.
train_errors, val_errors = train_routine(model,train_ds,val_ds,epochs,eval_metric=F.mse_loss)

In [None]:
# Training error per epoch for the first run.
_fig, ax = plt.subplots()
ax.plot(train_errors)
plt.show()


In [None]:
# Validation error per epoch for the first run.
_fig, ax = plt.subplots()
ax.plot(val_errors)
plt.show()

In [None]:
# Second run: keep training the same model at a higher learning rate.
learning_rate=0.001
newtrain_errors, newval_errors = train_routine(model,train_ds,val_ds,epochs,eval_metric=F.mse_loss)

In [None]:
# Third run: 5 more epochs at the lower learning rate, using train_routine's
# default eval_metric. Note that this rebinds the global `epochs`.
epochs=5
learning_rate=0.0001
newesttrain_errors, newestval_errors = train_routine(model,train_ds,val_ds,epochs)

In [None]:
# Training error across all three runs, in order.
plt.plot([*train_errors, *newtrain_errors, *newesttrain_errors])

In [None]:
# Validation error across all three runs, in order.
plt.plot([*val_errors, *newval_errors, *newestval_errors])

In [None]:
# Number of rows in the test frame.
len(test)

In [None]:
# The test questions were already tokenized earlier in the notebook. Passing a
# token list through spacy_tok again raised inside its try/except and came back
# as [], silently wiping every test question. Only tokenize cells that are
# still raw strings so this cell is safe to run at any point.
for column in ('question1', 'question2'):
    test[column] = test[column].apply(
        lambda question: spacy_tok(question) if isinstance(question, str) else question
    )


In [None]:
# Token frequencies over the test questions only.
testcounts = Counter()
for question_words in test['question1']:
    testcounts.update(question_words)
for question_words in test['question2']:
    testcounts.update(question_words)

# Drop tokens seen fewer than 3 times in the test set. The loop used to walk
# `counts` (the combined vocabulary), so rare test-only tokens were never removed.
for word in list(testcounts):
    if testcounts[word] < 3:
        del testcounts[word]

# NOTE(review): this rebinds the global `vocab2index` and `words` to a
# test-only vocabulary whose ids differ from the ones the model was trained
# with. encode_sentence keeps using the mapping bound as its default argument
# when it was defined, but `words` (read by val_metrics) now refers to this
# list. Confirm whether rebuilding the vocabulary here is intended.
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in testcounts:
    vocab2index[word] = len(words)
    words.append(word)

In [None]:
# Replace each test token list with its (id array, length) pair.
for column in ('question1', 'question2'):
    test[column] = test[column].apply(encode_sentence)

In [None]:

# Score every test pair with the trained model.
# Fixes relative to the previous loop:
#   * x2 was built from x1, so both inputs were the same question;
#   * the lengths were built with torch.Tensor(<int>), which allocates an
#     uninitialised tensor of that SIZE instead of storing the value;
#   * the inputs had no batch dimension;
#   * .loc[row] used labels, but `test` was sampled and keeps its original index;
#   * tensors were appended, which cannot be turned into a NumPy column later.
model.eval()  # disable dropout while predicting
device = next(model.parameters()).device
predictions = []
with torch.no_grad():
    for row in range(len(test)):
        x1, s1 = test['question1'].iloc[row]
        x2, s2 = test['question2'].iloc[row]
        # Add a batch dimension of 1; lengths are clamped to 1 because
        # sequence packing rejects empty sequences.
        x1 = torch.Tensor(x1).long().unsqueeze(0).to(device)
        x2 = torch.Tensor(x2).long().unsqueeze(0).to(device)
        s1 = torch.tensor([max(int(s1), 1)]).long().to(device)
        s2 = torch.tensor([max(int(s2), 1)]).long().to(device)
        y_hat_1 = model(x1, s1)
        y_hat_2 = model(x2, s2)
        prediction = torch.exp(-torch.abs(y_hat_2 - y_hat_1).sum(-1))
        predictions.append(float(prediction))

In [None]:
# Convert every prediction to a plain float first: np.array() over a list of
# (possibly CUDA) tensors does not produce a numeric column.
scores = np.array([float(p) for p in predictions])

# Use the real ids of the sampled rows when the frame has them; a fresh
# 0..n-1 range would not match the rows that were actually scored.
# NOTE(review): assumes the test CSV carries a `test_id` column; confirm.
if 'test_id' in test.columns:
    test_ids = test['test_id'].values
else:
    test_ids = np.arange(len(scores))

my_submission = pd.DataFrame({'test_id': test_ids, 'is_duplicate': scores})
my_submission.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame.
my_submission

In [None]:
# import the modules we'll need
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# function that takes in a dataframe and creates a text link to  
# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "submission.csv"):
    """Return an HTML anchor that downloads ``df`` as a CSV file.

    The frame is serialised with ``df.to_csv()``, base64-encoded and embedded
    in a ``data:`` URI, so the link works without writing a file to disk.

    Args:
        df: object exposing ``to_csv()``.
        title: visible link text.
        filename: name suggested to the browser for the download.
    """
    payload = base64.b64encode(df.to_csv().encode()).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=payload, title=title, filename=filename))

# Demo data: a 50x4 frame of standard-normal noise with columns A-D.
df = pd.DataFrame(data=np.random.randn(50, 4), columns=list('ABCD'))

# Render the download link for that frame as this cell's output.
create_download_link(df)