In [1]:
!pip install torch torchtext torchvision
from google.colab import drive
drive.mount('/content/gdrive')

Collecting torch
[?25l  Downloading https://files.pythonhosted.org/packages/49/0e/e382bcf1a6ae8225f50b99cc26effa2d4cc6d66975ccf3fa9590efcbedce/torch-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (519.5MB)
[K    100% |████████████████████████████████| 519.5MB 32kB/s 
tcmalloc: large alloc 1073750016 bytes == 0x584de000 @  0x7ff8b47d81c4 0x46d6a4 0x5fcbcc 0x4c494d 0x54f3c4 0x553aaf 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54e4c8
[?25hCollecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/78/90/474d5944d43001a6e72b9aaed5c3e4f77516fbef2317002da2096fd8b5ea/torchtext-0.2.3.tar.gz (42kB)
[K    100% |████████████████████████████████| 51kB 13.6MB/s 
[?25hCollecting torchvision
[?25l  Downloading https://files.pythonhosted.org/packages/ca/0d/f00b2885711e08bd71242ebe7b96561e6f6d01fdb4b9dcf4d37e2e13

In [0]:
import torch
from torch import nn, optim
import torch.nn.functional as f
from torch.autograd import Variable
from torchtext.vocab import GloVe
from torchtext import data
import numpy as np

In [0]:
class WikiQA_dataset():
  def __init__(self,base_dir,embedding_dim,batch_size):
    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True,
                           lower=True)
    self.LABEL = data.Field(sequential=False,
                            unk_token=None)


    self.train, self.dev, self.test = data.TabularDataset.splits(
        path=base_dir,
        train='WikiQA-train.tsv',
        validation='WikiQA-dev.tsv',
        test='WikiQA-test.tsv',
        format='tsv',
        fields=[('qid', self.RAW),
                ('question', self.TEXT),
                ('docid', self.RAW),
                ('doctitle', self.RAW),
                ('sentenceid',self.RAW),
                ('sentence',self.TEXT),
                ('label',self.LABEL)])

    self.TEXT.build_vocab(self.train, self.dev, self.test, \
                          vectors=GloVe(name='6B', dim=embedding_dim), \
                          unk_init=torch.zeros((1, embedding_dim)).uniform_(-0.25, 0.25))

    self.LABEL.build_vocab(self.train)

    self.train_iter, self.dev_iter, self.test_iter = \
        data.Iterator.splits((self.train, self.dev, self.test),
                             batch_sizes=[batch_size] * 3,
                             shuffle=True,
                             sort_key=lambda x: data.interleave_keys(0, 0))
    
class InsuranceQA_dataset():
  
  def __init__(self,base_dir,embedding_dim,batch_size,use_tokenized=True):
    self.RAW = data.RawField()
    self.TEXT = data.Field(batch_first=True,
                           lower=True)
    self.LABEL = data.Field(sequential=False,
                            unk_token=None)
    
    prefix = 'token' if use_tokenized else 'raw'

    self.train, self.dev, self.test = data.TabularDataset.splits(
        path=base_dir,
        train= prefix+'train.tsv',
        validation= prefix+'valid.tsv',
        test= prefix+'test.tsv',
        format='tsv',
        fields=[('question', self.TEXT),
                ('sentence',self.TEXT),
                ('label',self.LABEL)])

    self.TEXT.build_vocab(self.train, self.dev, self.test, \
                          vectors=GloVe(name='6B', dim=embedding_dim), \
                          unk_init=torch.zeros((1, embedding_dim)).uniform_(-0.25, 0.25))

    self.LABEL.build_vocab(self.train)

    self.train_iter, self.dev_iter, self.test_iter = \
        data.Iterator.splits((self.train, self.dev, self.test),
                             batch_sizes=[batch_size] * 3,
                             shuffle=True,
                             sort_key=lambda x: data.interleave_keys(0, 0))
    
class GloVeEmb(nn.Module):
  
  def __init__(self,emb_dim,dict_size,data,trainable=True):
    super(GloVeEmb,self).__init__()
    self.emb = nn.Embedding(dict_size,emb_dim)
    self.emb.weight.data.copy_(data.TEXT.vocab.vectors)
    self.emb.weight.requires_grad = trainable
    
  def forward(self,x):
    return self.emb(x)
    
class ConvolutionModule(nn.Module):
  
  def __init__(self,emb_dim,dict_size,hidden_dim,ctx_window):
    super(ConvolutionModule,self).__init__()
    self.conv = nn.Conv2d(1,hidden_dim,kernel_size=(ctx_window,emb_dim))
    odd_adjustment = 1 if ctx_window%2==0 else 0
    self.pad = nn.ZeroPad2d((0,0,ctx_window-1-odd_adjustment,ctx_window-1))
    
  def forward(self,x_emb):
    x_emb = x_emb.unsqueeze(1) #adding a dimension (the channel for the convolution)   
    return f.relu(self.conv(self.pad(x_emb))).squeeze(dim=3) # remove the single channel extra dimension

  
class BaselineQACNN(nn.Module):
  
  def __init__(self,emb_dim,dict_size,hidden_dim,ctx_window,data,decision_threshold=0.5):
    super(BaselineQACNN,self).__init__()
    self.emb = GloVeEmb(emb_dim,dict_size,data)
    self.convolution_q = ConvolutionModule(emb_dim,dict_size,hidden_dim,ctx_window)
    self.convolution_a = ConvolutionModule(emb_dim,dict_size,hidden_dim,ctx_window)
    self.h_pool = lambda t : self.horizontal_pooling(t)
    self.crit = nn.CosineEmbeddingLoss(margin=0.5)
    self.decision_threshold = decision_threshold
    self.tostring = {"name":"base_cnn","emb_dim":emb_dim,"hidden_dim":hidden_dim,"ctx_window":ctx_window}
    
  def forward(self,q,a):
    q_embed = self.emb(q)
    a_embed = self.emb(a)
    q_enc = self.convolution_q(q_embed)
    a_enc = self.convolution_a(a_embed)
    r_q = self.h_pool(q_enc)
    r_a = self.h_pool(a_enc)
    return r_q,r_a
  
  # given a matrix, does a maxpool operation on the rows
  # t.view(t.size(0),-1) = flatten into a 1D tensor each outcome for each input sample
  # passing from (n,a,b) to (n,a) dimensions
  # REVIEW squeeze the last dimension (just one channel); .squeeze() will break training with batch_size=1
  def horizontal_pooling(self,t):
    return f.max_pool1d(t,t.size(2)).view(t.size(0),-1)
  
  def compute_batch_stats(self,model_output,batch):
    r_q,r_a = model_output    
    batch_loss = self.crit(r_q,r_a,batch.label.float()*2.0-1) # the loss wants -1|1 values, input values are 0|1
    sim = f.cosine_similarity(r_q,r_a)
    # compute the decision of the network
    pred = sim.clone()
    pred[sim > self.decision_threshold] = 1
    pred[sim <= self.decision_threshold] = 0
    correct_pred = (pred.squeeze() == batch.label.float()).sum().float()
    return sim,batch_loss,correct_pred
  
  
class LSTMModule(nn.Module):
  
  def __init__(self,emb_dim,single_hidden_dim):
    super(LSTMModule,self).__init__()
    self.bilstm = nn.LSTM(emb_dim,single_hidden_dim,bidirectional=True)
    
  def forward(self,x_embed):
    # LSTM returns a tuple, the tensor is the first element
    return self.bilstm(x_embed)[0]
  

class BaselineQAbiLSTM(nn.Module):
  
  def __init__(self,emb_dim,dict_size,single_hidden_dim,data,decision_threshold=0.5):
    super(BaselineQAbiLSTM,self).__init__()
    self.emb = GloVeEmb(emb_dim,dict_size,data)
    self.bilstm_q = LSTMModule(emb_dim,single_hidden_dim)
    self.bilstm_a = LSTMModule(emb_dim,single_hidden_dim)
    self.h_pool = lambda t : self.horizontal_pooling(t)
    self.crit = nn.CosineEmbeddingLoss(margin=0.5)
    self.decision_threshold = decision_threshold
    self.tostring = {"name":"base_lstm","emb_dim":emb_dim,"hidden_dim":single_hidden_dim*2}
    
  def forward(self,q,a):
    q_embed = self.emb(q)
    a_embed = self.emb(a)
    q_enc = self.bilstm_q(q_embed)
    a_enc = self.bilstm_a(a_embed)
    r_q = self.h_pool(q_enc)
    r_a = self.h_pool(a_enc)
    return r_q,r_a
  
  # given a matrix, does a maxpool operation on the rows
  # t.view(t.size(0),-1) = flatten into a 1D tensor each outcome for each input sample
  # passing from (n,a,b) to (n,a) dimensions
  # (n_samples,num_words,emb_size)
  def horizontal_pooling(self,t):
    return f.max_pool1d(t.transpose(1,2),t.size(1)).view(t.size(0),-1)
  
  def compute_batch_stats(self,model_output,batch):
    r_q,r_a = model_output    
    batch_loss = self.crit(r_q,r_a,batch.label.float()*2.0-1) # the loss wants -1|1 values, input values are 0|1
    sim = f.cosine_similarity(r_q,r_a)
    # compute the decision of the network
    pred = sim.clone()
    pred[sim > self.decision_threshold] = 1
    pred[sim <= self.decision_threshold] = 0
    correct_pred = (pred.squeeze() == batch.label.float()).sum().float()
    return sim,batch_loss,correct_pred
  
  
class AttentionMatrix(nn.Module):
  
  def __init__(self,emb_dim):
    super(AttentionMatrix,self).__init__()
    #self.u = nn.Parameter(torch.Tensor(emb_dim,emb_dim).type(torch.FloatTensor),requires_grad=True)
    self.u = nn.Parameter(torch.from_numpy(np.random.normal(size=(emb_dim,emb_dim))).type(torch.FloatTensor),requires_grad=True)
    
  def forward(self,q,a):
    qt = q.transpose(1,2)
    out = torch.matmul(torch.matmul(qt,self.u),a) # Qt*U*A
    return torch.tanh(out)

class AP_CNN(nn.Module):
  
  def __init__(self,emb_dim,dict_size,hidden_dim,ctx_window,data,decision_threshold=0.5):
    super(AP_CNN,self).__init__()
    self.emb = GloVeEmb(emb_dim,dict_size,data)
    self.convolution_q = ConvolutionModule(emb_dim,dict_size,hidden_dim,ctx_window)
    self.convolution_a = ConvolutionModule(emb_dim,dict_size,hidden_dim,ctx_window)
    self.attention_mat = AttentionMatrix(hidden_dim)
    self.h_pool = lambda t : self.horizontal_pooling(t)
    self.v_pool = lambda t : self.vertical_pooling(t)
    self.crit = nn.CosineEmbeddingLoss(margin=0.5)
    self.decision_threshold = decision_threshold
    self.tostring = {"name":"ap_cnn","emb_dim":emb_dim,"hidden_dim":hidden_dim,"ctx_window":ctx_window}
    
  def flatten(self,x):
    return x.view(x.size(0),-1)
    
  def forward(self,q,a):
    q_emb = self.emb(q)
    a_emb = self.emb(a)
    q_enc = self.convolution_q(q_emb)
    a_enc = self.convolution_a(a_emb)
    mat = self.attention_mat(q_enc,a_enc) # check dimensions
    q_att = f.softmax(self.h_pool(mat),dim=1)
    a_att = f.softmax(self.v_pool(mat),dim=1)
    q = self.flatten(torch.matmul(q_enc,q_att))
    a = self.flatten(torch.matmul(a_enc,a_att))
    return q,a
  
  def horizontal_pooling(self,x):
    return f.max_pool1d(x,x.size(2))
  
  def vertical_pooling(self,x):
    return self.horizontal_pooling(x.transpose(1,2)) 
    
  def compute_batch_stats(self,model_output,batch): # TO BE fixed
    r_q,r_a = model_output
    batch_loss = self.crit(r_q,r_a,batch.label.float()*2.0-1) # the loss wants -1|1 values, input values are 0|1
    sim = f.cosine_similarity(r_q,r_a)
    # compute the decision of the network
    pred = sim.clone()
    pred[sim > self.decision_threshold] = 1
    pred[sim <= self.decision_threshold] = 0
    correct_pred = (pred.squeeze() == batch.label.float()).sum().float()
    return sim,batch_loss,correct_pred

  
class AP_biLSTM(nn.Module):
  
  def __init__(self,emb_dim,dict_size,single_hidden_dim,data,decision_threshold=0.5):
    super(AP_biLSTM,self).__init__()
    self.emb = GloVeEmb(emb_dim,dict_size,data)
    self.bilstm_q = LSTMModule(emb_dim,single_hidden_dim)
    self.bilstm_a = LSTMModule(emb_dim,single_hidden_dim)
    self.attention_mat = AttentionMatrix(single_hidden_dim*2)
    self.h_pool = lambda t : self.horizontal_pooling(t)
    self.v_pool = lambda t : self.vertical_pooling(t)
    self.crit = nn.CosineEmbeddingLoss(margin=0.5)
    self.decision_threshold = decision_threshold
    self.tostring = {"name":"ap_bilstm","emb_dim":emb_dim,"hidden_dim":single_hidden_dim*2}
    
  # Work in progress
  def forward(self,q,a):
    q_embed = self.emb(q)
    a_embed = self.emb(a)
    q_enc = self.bilstm_q(q_embed).transpose(1,2) #transposing in order to have (emb,lenght) for the attention matrix
    a_enc = self.bilstm_a(a_embed).transpose(1,2)
    mat = self.attention_mat(q_enc,a_enc)
    # check attention mat dimensions
    q_att = f.softmax(self.h_pool(mat),dim=1)
    a_att = f.softmax(self.v_pool(mat),dim=1)
    q = self.flatten(torch.matmul(q_enc,q_att))
    a = self.flatten(torch.matmul(a_enc,a_att))
    return q,a
    
  def flatten(self,x):
    return x.view(x.size(0),-1)
  
  def horizontal_pooling(self,x):
    return f.max_pool1d(x,x.size(2))
  
  def vertical_pooling(self,x):
    return self.horizontal_pooling(x.transpose(1,2)) 
  
  def compute_batch_stats(self,model_output,batch): # TO BE fixed
    r_q,r_a = model_output
    batch_loss = self.crit(r_q,r_a,batch.label.float()*2.0-1) # the loss wants -1|1 values, input values are 0|1
    sim = f.cosine_similarity(r_q,r_a)
    # compute the decision of the network
    pred = sim.clone()
    pred[sim > self.decision_threshold] = 1
    pred[sim <= self.decision_threshold] = 0
    correct_pred = (pred.squeeze() == batch.label.float()).sum().float()
    return sim,batch_loss,correct_pred
    
  
def test(model, data, mode='test'):
    if mode == 'dev':
        iterator = iter(data.dev_iter)
    else:
        iterator = iter(data.test_iter)
    model.eval()
    
    correct_predictions, loss, number_of_samples = 0, 0, 0
    for batch in iterator:
        q,a = 'question', 'sentence'
        q,a = getattr(batch, q), getattr(batch, a)
        model_output = model(q,a)
        
        output, batch_loss, batch_correct_predictions = model.compute_batch_stats(model_output,
                                                                                  batch)

        correct_predictions += batch_correct_predictions
        number_of_samples += len(output)
        loss += batch_loss.item()
    model.train()
    acc = correct_predictions / number_of_samples
    acc = acc.cpu().item()
    return loss, acc
  
def tostring(model,batch_size):
  s = model.tostring["name"]
  for key in model.tostring:
    if key != "name":
      s += "_"+key+str(model.tostring[key])
  return s+"_bs"+str(batch_size)

In [47]:
emb_dim = 300
hidden_dim = 141# 4000
context_window = 3
batch_size = 20
print_freq = 5

from enum import Enum
available_datasets = Enum("available_datasets","wiki insurance")
chosen_ds = available_datasets.wiki
wiki_base_dir = '/content/gdrive/My Drive/WikiQA/'
insurance_base_dir ='/content/gdrive/My Drive/InsuranceQA/'

if chosen_ds == available_datasets.wiki:
  base_dir = wiki_base_dir
  dataset = WikiQA_dataset(wiki_base_dir,emb_dim,batch_size)  
elif chosen_ds == available_datasets.insurance:
  base_dir = insurance_base_dir
  dataset = InsuranceQA_dataset(insurance_base_dir,emb_dim,batch_size)


dict_size = len(dataset.TEXT.vocab)

# model = BaselineQACNN(emb_dim,dict_size,hidden_dim,context_window,dataset)
# model = BaselineQAbiLSTM(emb_dim,dict_size,hidden_dim,dataset)
# model = AP_CNN(emb_dim,dict_size,hidden_dim,context_window,dataset)
model = AP_biLSTM(emb_dim,dict_size,hidden_dim,dataset)

print(model)

if torch.cuda.is_available():
  print("CUDA available")
  model = model.cuda()
  
learning_rate = 1.1
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate) # Adam
loss = 0
max_dev_acc, max_test_acc = 0, 0

model.train()
  
for i,batch in enumerate(dataset.train_iter):
  q,a = getattr(batch,'question'),getattr(batch,'sentence')
  optimizer.zero_grad()
  res = model(q,a)
  out,batch_loss,correct_predictions = model.compute_batch_stats(res,batch)

  loss += batch_loss.item()
  batch_loss.backward()
  optimizer.step()
  if (i + 1) % print_freq == 0:
    dev_loss, dev_acc = test(model,dataset, mode='dev')
    test_loss, test_acc = test(model,dataset)
    c = (i + 1) // print_freq
    print('[INFO] train loss: {:.2f} / dev loss: {:.2f} / test loss: {:.2f}'
          ' / dev acc: {:.2f} / test acc: {:.2f}'.format(loss, dev_loss,
                                                         test_loss,
                                                         dev_acc * 100,
                                                         test_acc * 100))
    '''
    if dev_acc > max_dev_acc:
          max_dev_acc = dev_acc
          max_test_acc = test_acc
          torch.save(model.state_dict(),base_dir+tostring(model,batch_size)+'.pt')
          with open(base_dir+'perf.dat','w') as file:
            file.write('train loss: {:.2f} / dev loss: {:.2f} / test loss: {:.2f}'
                      ' / dev acc: {:.2f} / test acc: {:.2f}'.format(loss, dev_loss,
                                                         test_loss,
                                                         dev_acc * 100,
                                                         test_acc * 100))
    '''
    loss = 0



AP_biLSTM(
  (emb): GloVeEmb(
    (emb): Embedding(63328, 300)
  )
  (bilstm_q): LSTMModule(
    (bilstm): LSTM(300, 141, bidirectional=True)
  )
  (bilstm_a): LSTMModule(
    (bilstm): LSTM(300, 141, bidirectional=True)
  )
  (attention_mat): AttentionMatrix()
  (crit): CosineEmbeddingLoss()
)
CUDA available


  return Variable(arr, volatile=not train)


[INFO] train loss: 0.31 / dev loss: 5.66 / test loss: 11.46 / dev acc: 94.33 / test acc: 94.69
[INFO] train loss: 0.21 / dev loss: 5.42 / test loss: 11.21 / dev acc: 93.64 / test acc: 92.74
[INFO] train loss: 0.18 / dev loss: 5.51 / test loss: 11.47 / dev acc: 93.93 / test acc: 93.05
[INFO] train loss: 0.26 / dev loss: 5.46 / test loss: 11.42 / dev acc: 93.49 / test acc: 92.37
[INFO] train loss: 0.13 / dev loss: 5.33 / test loss: 11.31 / dev acc: 93.53 / test acc: 91.99
[INFO] train loss: 0.17 / dev loss: 5.35 / test loss: 11.30 / dev acc: 94.00 / test acc: 92.95
[INFO] train loss: 0.26 / dev loss: 5.36 / test loss: 11.32 / dev acc: 94.44 / test acc: 93.98
[INFO] train loss: 0.28 / dev loss: 5.12 / test loss: 10.76 / dev acc: 93.34 / test acc: 92.51
[INFO] train loss: 0.07 / dev loss: 5.05 / test loss: 10.82 / dev acc: 93.05 / test acc: 91.68
[INFO] train loss: 0.19 / dev loss: 5.08 / test loss: 10.76 / dev acc: 92.87 / test acc: 92.23
[INFO] train loss: 0.13 / dev loss: 5.17 / test lo

KeyboardInterrupt: ignored


---

*Draft snippets*

---

In [0]:
l = list(wiki_dataset.train.label)[1:]
l = [int(_) for _ in l]
import numpy as np
np.sum(l)
len(l)

In [0]:
from torchvision import datasets,transforms
data = datasets.MNIST('.',train=True,download=True,transform= transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]))

In [0]:
from google.colab import files
files.download('net.pt')

In [0]:
class uq(nn.Module):
  def __init__(self,dim):
    super(uq,self).__init__()
    self.dim = dim
    
  def forward(self,x):
    return x.unsqueeze(dim=self.dim)
  
  
model = nn.Sequential(uq(1),nn.Conv2d(1,1,(2,3)))
import numpy as np

x = torch.from_numpy(np.random.randint(20,size=(30,2,3))).float()
model(x).size()

In [28]:
import numpy as np
q = torch.from_numpy(np.random.randint(20,size=(30,4,5))).type(torch.FloatTensor)
u = torch.Tensor(5,5).type(torch.FloatTensor)
a = torch.from_numpy(np.random.randint(20,size=(30,5,9))).type(torch.FloatTensor)

torch.matmul(torch.matmul(q,u),a).size()

torch.Size([30, 4, 9])

In [30]:
nn.Parameter(torch.Tensor(emb_dim,emb_dim).type(torch.FloatTensor),requires_grad=True)
torch.from_numpy(np.zeros((3,3))).type(torch.FloatTensor)

tensor([[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]])