In [66]:
from itertools import chain

import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [3]:
# Load the pre-cleaned train and test title pairs.
d_train, d_test = map(
    pd.read_csv,
    ("../data/text_clean/train.csv", "../data/text_clean/test.csv"),
)

In [5]:
# Tokenize both pre-processed title columns of the train and test frames,
# storing the token lists next to the source text.
for frame in (d_train, d_test):
    for source_col, token_col in (
        ("title_1_pre", "title_1_token"),
        ("title_2_pre", "title_2_token"),
    ):
        frame.loc[:, token_col] = frame[source_col].apply(word_tokenize)

In [9]:
# Flatten every training token (first titles, then second titles) into one list.
title_token = list(
    chain.from_iterable(chain(d_train.title_1_token, d_train.title_2_token))
)

In [12]:
# Unique training tokens in sorted order. Iterating a bare set of strings
# follows hash order, which changes between interpreter sessions, so the
# token -> id mapping built from it would differ on every kernel restart.
# Sorting makes the vocabulary (and therefore the ids) reproducible.
vocab_token = sorted(set(title_token))

In [14]:
# Ids start at 2; 0 and 1 are left free for the special tokens.
word2idx = {token: index for index, token in enumerate(vocab_token, start=2)}
idx2word = {index: token for index, token in enumerate(vocab_token, start=2)}

In [25]:
# Register the special tokens: 1 for unknown words, 0 for padding.
word2idx.update({'<UNK>': 1, '<PAD>': 0})
idx2word.update({1: '<UNK>', 0: '<PAD>'})

In [76]:
class TitleDataset():
    """Train/validation/test container for tokenized title pairs.

    ``data`` is split into train and validation parts with
    ``train_test_split``; ``test`` is stored as given. One split is active at
    a time (see ``set_split``) and indexing / ``len`` operate on that split.

    Args:
        data: DataFrame holding the labelled examples. ``__getitem__`` reads
            the columns ``title_1_token``, ``title_2_token`` and ``Label``.
        test: DataFrame used for the ``'test'`` split.
        word2idx: Mapping from token to integer id.
        idx2word: Mapping from integer id to token.
        random_state: Optional seed forwarded to ``train_test_split`` so the
            train/validation split can be reproduced. ``None`` (the default)
            keeps the split random.
    """

    # Fallbacks used for out-of-vocabulary lookups.
    UNK_TOKEN = '<UNK>'
    UNK_IDX = 1

    def __init__(self, data, test, word2idx, idx2word, random_state=None):
        train, val = train_test_split(data, random_state=random_state)
        # Renumber rows from 0 so that __getitem__ can look rows up by position.
        train = train.reset_index(drop=True)
        val = val.reset_index(drop=True)
        self.word2idx = word2idx
        self.idx2word = idx2word
        self.dataset = {
            'train': (train, train.shape[0]),
            'val': (val, val.shape[0]),
            'test': (test, test.shape[0])
        }
        self.set_split('train')

    def set_split(self, split='train'):
        """Make ``split`` ('train', 'val' or 'test') the active split."""
        self.data, self.length = self.dataset[split]

    def encode(self, text):
        """Map an iterable of tokens to a LongTensor of ids.

        Tokens missing from ``word2idx`` are mapped to the unknown-token id.
        """
        unk_id = self.word2idx.get(self.UNK_TOKEN, self.UNK_IDX)
        return torch.LongTensor([self.word2idx.get(word, unk_id) for word in text])

    def decode(self, ids):
        """Map a 1-D sequence of ids back to a list of tokens.

        ``ids`` may be a tensor (for example the output of ``encode``) or any
        iterable of integers / 0-d tensors. Ids missing from ``idx2word`` are
        rendered as ``'<UNK>'``.
        """
        # Tensors hash by identity, so they never match the integer keys of
        # idx2word; convert to plain Python ints before the lookup.
        if torch.is_tensor(ids):
            ids = ids.tolist()
        words = []
        for id_ in ids:
            key = id_.item() if torch.is_tensor(id_) else id_
            words.append(self.idx2word.get(key, self.UNK_TOKEN))
        return words

    def __getitem__(self, idx):
        """Return ``(title_1_ids, title_2_ids, label)`` for row ``idx`` of the active split."""
        t1 = self.data.loc[idx, 'title_1_token']
        t2 = self.data.loc[idx, 'title_2_token']
        label = self.data.loc[idx, 'Label']

        return self.encode(t1), self.encode(t2), label

    def __len__(self):
        """Number of rows in the active split."""
        return self.length

In [77]:
# Wrap the tokenized frames and the vocabulary mappings in a dataset object.
dataset = TitleDataset(
    data=d_train,
    test=d_test,
    word2idx=word2idx,
    idx2word=idx2word,
)

In [78]:
# Number of entries in word2idx: every training token plus <UNK> and <PAD>.
num_vocab = len(word2idx)

In [102]:
# Show the vocabulary size (special tokens included).
num_vocab

8476

In [79]:
class TextEncoder(nn.Module):
    """Embed token ids and run the embeddings through an LSTM.

    Args:
        num_vocab: Number of rows in the embedding table.
        emb_size: Embedding dimension (also the LSTM input size).
        hid_size: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
        padding_idx: Id of the padding token. Its embedding is initialised to
            zeros and receives no gradient. Defaults to 0, the value
            ``pad_sequence`` pads with by default. Pass ``None`` to treat
            every id as a regular token.
    """

    def __init__(self, num_vocab, emb_size, hid_size, num_layers, padding_idx=0):
        super().__init__()
        # Kept as a two-stage Sequential so parameter names stay
        # ``network.0.*`` (embedding) and ``network.1.*`` (LSTM).
        self.network = nn.Sequential(
            nn.Embedding(num_vocab, emb_size, padding_idx=padding_idx),
            nn.LSTM(emb_size, hid_size, num_layers=num_layers, batch_first=True)
        )

    def forward(self, input_):
        """Encode a batch-first tensor of token ids.

        Returns whatever the LSTM returns: ``(output, (h_n, c_n))``.
        Padded positions are still fed through the LSTM; no sequence packing
        is done here.
        """
        return self.network(input_)

In [80]:
# Single-layer encoder: 768-dimensional embeddings feeding a 512-unit LSTM.
model = TextEncoder(
    num_vocab=num_vocab,
    emb_size=768,
    hid_size=512,
    num_layers=1,
)

In [81]:
# Count only parameters that receive gradients, so the number matches the
# "Trainable parameters" label it is reported under (frozen weights excluded).
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

In [82]:
# Report the parameter count with thousands separators.
print(f"Trainable parameters {num_params:,}")

Trainable parameters 9,135,104


In [83]:
def pad_text(batch_data):
    """Collate ``(title_1_ids, title_2_ids, label)`` samples into a batch.

    Each title column is padded independently to the longest sequence in the
    batch (batch-first layout, default pad value); the labels are gathered
    into a LongTensor.

    Returns:
        Tuple ``(padded_title_1, padded_title_2, labels)``.
    """
    first_titles, second_titles, labels = zip(*batch_data)
    padded_first = pad_sequence(first_titles, batch_first=True)
    padded_second = pad_sequence(second_titles, batch_first=True)
    return (padded_first, padded_second, torch.LongTensor(labels))

In [84]:
# Serve the training split in padded mini-batches of two title pairs.
dataset.set_split(split="train")
data_gen = DataLoader(
    dataset=dataset,
    batch_size=2,
    collate_fn=pad_text,
)

In [89]:
# Pull a single batch for inspection. next(iter(...)) raises StopIteration on
# an empty loader instead of silently leaving title1/title2/label undefined
# (or stale from an earlier run), as a `for ... break` loop would.
title1, title2, label = next(iter(data_gen))

In [92]:
# Shape of the padded first-title batch (batch-first, padded by pad_text).
title1.shape

torch.Size([2, 14])

In [93]:
# Shape of the padded second-title batch; padded separately from title1,
# so its sequence length can differ.
title2.shape

torch.Size([2, 12])

In [103]:
# Encode both title batches with the same model.
# NOTE: h and c are rebound by the second call, so after this cell they hold
# the states for title2 only; the states for title1 are discarded.
out1, (h, c) = model(title1)
out2, (h, c) = model(title2)

In [105]:
# Shape of the LSTM output sequence for the first-title batch.
out1.shape

torch.Size([2, 14, 512])

In [106]:
# Shape of the LSTM output sequence for the second-title batch.
out2.shape

torch.Size([2, 12, 512])