# Multi Input networks
- In many cases, the problem and dataset of interest require multiple inputs to a prediction model
- In this tutorial, we learn how to implement and train multi-input networks with the Quora question retrieval dataset

In [3]:
import numpy as np
import pandas as pd
import torch, torchvision
import torch.nn as nn
import torch.nn.functional as F
torch.__version__

'1.3.1'

## 1. Import & process dataset
- Quora question retrieval dataset
- Find out whether the two questions are equivalent or not (i.e., duplicate)
- Source: http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv

In [6]:
# Read the tab-separated Quora question-pair file directly from its source URL
QUORA_TSV_URL = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
data = pd.read_csv(QUORA_TSV_URL, sep = "\t")

# report (rows, columns), then preview the first rows as the cell output
print(data.shape)
data.head()

(404290, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [0]:
# To keep training fast in this tutorial we keep only the first 10000 instances and
# (when encoding later) at most 10 tokens per question.
# You can try learning with all instances yourself and compare the results!
num_instances = 10000
max_length = 10

data = data.iloc[:num_instances]

def _tokenize(column):
  """Split every entry of `column` on whitespace; str() makes non-string cells splittable too."""
  return [str(text).split() for text in column]

q1 = _tokenize(data["question1"])
q2 = _tokenize(data["question2"])

In [0]:
# Build the vocabulary: every distinct token appearing in either question of a pair.
# The set is turned into a list at the end so each token gets a positional index.
vocabulary = set()
for pair_idx, first_tokens in enumerate(q1):
  # add the tokens of question 1, then those of the matching question 2
  vocabulary.update(first_tokens, q2[pair_idx])
unique_tokens = list(vocabulary)

In [0]:
# create X, y data
# note that there are two X data (X_data_1, X_data_2)!
y_data = data["is_duplicate"].values

# Build the token -> index table once. Looking tokens up with unique_tokens.index()
# scans the whole vocabulary for every single token, which makes encoding needlessly slow.
# unique_tokens comes from a set, so every token maps to the same index .index() would return.
token_to_index = {token: idx for idx, token in enumerate(unique_tokens)}

def _encode_questions(questions):
  """Turn tokenized questions into a (num_instances, max_length) array of token indices.

  Each row holds the indices of the first `max_length` tokens of one question;
  shorter questions leave the remaining positions at 0.
  NOTE(review): 0 is also the index of the first vocabulary token, so padding is
  indistinguishable from that token - consider reserving an index for padding.
  """
  encoded = np.zeros((num_instances, max_length))
  for row, tokens in enumerate(questions):
    # slicing covers both cases: short questions are taken whole, long ones truncated
    token_ids = [token_to_index[token] for token in tokens[:max_length]]
    if token_ids:
      encoded[row, :len(token_ids)] = token_ids
  return encoded

X_data_1 = _encode_questions(q1)
X_data_2 = _encode_questions(q2)

In [0]:
# generate dataset and data loader instances
# we use SubsetRandomSampler to sample test instances here
from torch.utils.data.sampler import SubsetRandomSampler

class QuoraDataset(torch.utils.data.Dataset):
  """Dataset of question pairs yielding ``(question1_ids, question2_ids, label)`` triples.

  Args:
    x1: indexable collection of encoded first questions; defaults to the
      notebook-level ``X_data_1`` when omitted.
    x2: indexable collection of encoded second questions; defaults to the
      notebook-level ``X_data_2`` when omitted.
    y: labels, one per pair; defaults to the notebook-level ``y_data`` when
      omitted. Always stored as int64, the dtype ``nn.CrossEntropyLoss`` expects
      for class targets.

  Calling ``QuoraDataset()`` with no arguments behaves exactly as before; passing
  the arrays explicitly avoids depending on hidden notebook state.
  """
  def __init__(self, x1 = None, x2 = None, y = None):
    # fall back to the notebook globals only for arguments that were not supplied
    self.x1 = X_data_1 if x1 is None else x1
    self.x2 = X_data_2 if x2 is None else x2
    labels = y_data if y is None else y
    self.y = np.asarray(labels).astype("int64")
  
  def __getitem__(self, idx):
    """Return the two encoded questions and the label stored at position ``idx``."""
    return self.x1[idx], self.x2[idx], self.y[idx]
  
  def __len__(self):
    """Number of question pairs (taken from the first input array)."""
    return len(self.x1)

dataset = QuoraDataset()
NUM_INSTANCES = len(dataset)
TEST_RATIO = 0.3
# derive the test size from TEST_RATIO so changing the ratio actually changes the split
TEST_SIZE = int(NUM_INSTANCES * TEST_RATIO)
BATCH_SIZE = 128
SPLIT_SEED = 0    # fixed seed so the train/test membership is the same on every run

indices = list(range(NUM_INSTANCES))

# a dedicated generator keeps the split reproducible without touching NumPy's global RNG state
split_rng = np.random.RandomState(SPLIT_SEED)
test_idx = split_rng.choice(indices, size = TEST_SIZE, replace = False).tolist()

# everything that was not drawn for testing is used for training (original order preserved)
held_out = set(test_idx)
train_idx = [idx for idx in indices if idx not in held_out]
train_sampler, test_sampler = SubsetRandomSampler(train_idx), SubsetRandomSampler(test_idx)

# both loaders read from the same dataset; the samplers restrict each to its own indices
train_loader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE, sampler = train_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE, sampler = test_sampler)

## 2. Creating Multi Input Network

- Create and train Multi Input Network for question retrieval

In [0]:
# create a multi-input network: shared embedding, one dense layer per question, and a final dense classifier
class net(nn.Module):
  """Two-input classifier: shared embedding, one dense branch per input, joint output layer.

  Args:
    input_dim: number of token positions per input sequence.
    embedding_dim: size of each token embedding vector.
    num_words: number of rows in the embedding table (vocabulary size).
    num_hidden_cells: output width of each per-input dense branch.
  """
  def __init__(self, input_dim, embedding_dim, num_words, num_hidden_cells):
    super().__init__()
    # each sequence is embedded and then flattened to one vector of this length
    flat_features = embedding_dim * input_dim
    self.embedding = nn.Embedding(num_words, embedding_dim)
    self.dense_1 = nn.Linear(flat_features, num_hidden_cells)
    self.dense_2 = nn.Linear(flat_features, num_hidden_cells)
    # the two branch outputs are concatenated, hence twice the hidden width; 2 output classes
    self.final_dense = nn.Linear(2 * num_hidden_cells, 2)

  def forward(self, x1, x2):
    """Map two batches of token indices, each (batch, input_dim), to (batch, 2) class scores."""
    # first input: embed, flatten per instance, project
    flat_1 = self.embedding(x1).view(x1.size(0), -1)
    hidden_1 = self.dense_1(flat_1)
    # second input: same embedding table, but its own dense layer
    flat_2 = self.embedding(x2).view(x2.size(0), -1)
    hidden_2 = self.dense_2(flat_2)
    # join the branches along the feature axis and score the pair
    merged = torch.cat((hidden_1, hidden_2), dim = 1)
    return self.final_dense(merged)

In [0]:
# hyperparameters
# -- model shape --
INPUT_DIM = max_length            # token positions per question
NUM_WORDS = len(unique_tokens)    # vocabulary size, i.e. rows of the embedding table
EMBEDDING_DIM = 50
HIDDEN_SIZE = 30
# -- optimization --
NUM_EPOCHS = 10
LEARNING_RATE = 1e-2

In [0]:
# instantiate the network from the hyperparameters defined above
model = net(
  input_dim = INPUT_DIM,
  embedding_dim = EMBEDDING_DIM,
  num_words = NUM_WORDS,
  num_hidden_cells = HIDDEN_SIZE,
)
# CrossEntropyLoss works on raw scores, so the network itself needs no softmax layer
criterion = nn.CrossEntropyLoss()
# Adam updates every trainable parameter of the model
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [39]:
# Train for NUM_EPOCHS passes over the training loader, reporting the mean batch loss per epoch
for epoch in range(NUM_EPOCHS):
  batch_losses = []
  for first_q, second_q, labels in train_loader:
    # nn.Embedding needs integer (int64) indices, so cast both inputs before the forward pass
    first_q = first_q.long()
    second_q = second_q.long()

    # clear gradients left over from the previous batch
    optimizer.zero_grad()

    logits = model(first_q, second_q)
    loss = criterion(logits, labels)

    # backpropagate and apply one optimizer update
    loss.backward()
    optimizer.step()

    batch_losses.append(loss.item())

  print("Loss at {}th epoch: {}".format(epoch, np.mean(batch_losses)))

Loss at 0th epoch: 8.358021567057056e-05
Loss at 1th epoch: 6.927385127875658e-05
Loss at 2th epoch: 5.8683096689161506e-05
Loss at 3th epoch: 5.074793185816485e-05
Loss at 4th epoch: 4.445249596756185e-05
Loss at 5th epoch: 3.919280668262879e-05
Loss at 6th epoch: 3.4757104962905446e-05
Loss at 7th epoch: 3.1304369787738486e-05
Loss at 8th epoch: 2.82746555802243e-05
Loss at 9th epoch: 2.5681897552038372e-05


## 3. Evaluation
- Evaluate the trained multi-input model with the accuracy score
  - Store the predicted label of each test instance in a list and compare it with the true label

In [40]:
# Collect predicted and true labels over the whole test loader
y_pred, y_true = [], []
with torch.no_grad():   # no gradients are needed for evaluation
  for x1, x2, y in test_loader:
    x1, x2 = x1.long(), x2.long()
    # Pick the highest-scoring class along the class dimension (dim 1).
    # Softmax is monotonic, so it cannot change which class wins; dropping it also removes
    # the deprecated F.softmax call without an explicit `dim`.
    outputs = model(x1, x2).argmax(dim = 1)   # predicted label
    y_true += y.tolist()                      # true label
    y_pred += outputs.tolist()

  """


In [41]:
# evaluation result: fraction of test instances whose predicted label equals the true label
from sklearn.metrics import accuracy_score
accuracy_score(y_true = y_true, y_pred = y_pred)

0.6016666666666667