In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Dirichlet

import re
import math
from collections import Counter
from tqdm import tqdm
from itertools import islice
import numpy as np

import torchtext
from torchtext.vocab import Vectors
from torchtext.data.iterator import BPTTIterator
from torchtext.data import Batch, Dataset

from namedtensor import ntorch
from namedtensor.text import NamedField

In [2]:
# Our input $x$
TEXT = NamedField(names=("seqlen",))
# Data distributed with the assignment
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".", 
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)
TEXT.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))

class NamedBpttIterator(BPTTIterator):
    def __iter__(self):
        text = self.dataset[0].text
        TEXT = self.dataset.fields['text']
        TEXT.eos_token = None
        text = text + ([TEXT.pad_token] * int(math.ceil(len(text) / self.batch_size)
                                              * self.batch_size - len(text)))
        data = TEXT.numericalize(
            [text], device=self.device)
        data = (data
            .stack(("seqlen", "batch"), "flat")
            .split("flat", ("batch", "seqlen"), batch=self.batch_size)
            .transpose("seqlen", "batch")
        )

        dataset = Dataset(examples=self.dataset.examples, fields=[
            ('text', TEXT), ('target', TEXT)])
        while True:
            for i in range(0, len(self) * self.bptt_len, self.bptt_len):
                self.iterations += 1
                seq_len = min(self.bptt_len, len(data) - i - 1)
                yield Batch.fromvars(
                    dataset, self.batch_size,
                    text = data.narrow("seqlen", i, seq_len),
                    target = data.narrow("seqlen", i+1, seq_len),
                )
                         
            if not self.repeat:
                return

len(TEXT.vocab) 10001


In [3]:
train_iter, val_iter, test_iter = NamedBpttIterator.splits(
    (train, val, test), batch_size=10, device=torch.device("cuda"), bptt_len=32, repeat=False)

In [11]:
LSM = nn.LogSoftmax(dim=1)
from torch.nn.utils import weight_norm
### adapted from https://github.com/locuslab/TCN/ ###
class Chomp1d(torch.nn.Module):
    '''Ensure causal convolutions by removing right most items'''
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size
    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()
    
    
class TC_block(torch.nn.Module):
    def __init__(self, n_in, n_out, kernel, stride, dilation, padding, dropout=0.5):
        super(TC_block, self).__init__()
        self.conv1 = weight_norm(torch.nn.Conv1d(n_in, n_out, kernel,stride=stride,
                                             padding=padding, dilation=dilation))
      
        self.conv2 = weight_norm(nn.Conv1d(n_out, n_out, kernel,stride=stride,
                                           padding=padding, dilation=dilation))
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
    
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
        self.chomp1 = Chomp1d(padding)
        self.chomp2 = Chomp1d(padding)
    
        self.block = torch.nn.Sequential(self.conv1,self.chomp1,self.relu1,self.dropout1,
                                          self.conv2,self.chomp2,self.relu2,self.dropout2)
        self.relu = nn.ReLU()
    
        if n_in != n_out:
            self.conv_re = nn.Conv1d(n_in,n_out,kernel_size=1,stride=1,padding=0)
    def forward(self, x):
        out = self.block(x)
        # skip connection
        if x.shape[1]!=out.shape[1]:
            x = self.conv_re(x)
        return self.relu(out + x)
      
      
      
class TCN(torch.nn.Module):
    def __init__(self, n_layers, n_filters, kernel=2, dropout=0.2, embedding_size = 600, n_words = 10001):  
        super(TCN, self).__init__()
        blocks = []
        self.embedding_size = embedding_size
        self.n_words = n_words
        self.embedding = nn.Embedding(self.n_words,self.embedding_size)
    
        self.n_filters = [self.embedding_size] + n_filters
    
        for i in range(1,n_layers):
            dilation = 2 ** i
            n_in = self.n_filters[i-1]
            n_out = self.n_filters[i]
            blocks.append(TC_block(n_in, n_out, kernel, stride=1, dilation=dilation, padding=(kernel-1) * dilation, dropout=0.2))
            
        self.network = nn.Sequential(*blocks)
        self.receptive_field = 1 + 2*(kernel-1)*(2 ** n_layers-1) + 1
        self.output_layer = nn.Linear(n_filters[-1], n_words)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        embed = self.embedding(x)
        hook = self.network(embed.transpose(1,2))
        return self.output_layer(hook.transpose(1,2)).transpose(1,2)

In [12]:
tc_net = TCN(3,[600,600,600])
model_parameters = filter(lambda p: p.requires_grad, tc_net.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print('Network with {} parameters'.format(params))
print('Receptive field of network is {}'.format(tc_net.receptive_field))

Network with 14896001 parameters
Receptive field of network is 16


In [33]:
tc_net.cuda()
optimizer = torch.optim.Adam(tc_net.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss(reduction='sum')
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,mode="min",patience=4,min_lr=1e-6)

def training_loop(net,train_iter,optimizer,criterion,scheduler,e=0):
    net.train()
    for i,b in enumerate(train_iter):
        optimizer.zero_grad()
        data = torch.transpose(b.text.values, dim0=0, dim1=1)
        X = data[:,:-1]
        y = data[:,1:]
        prob = net(X)
        loss = criterion(prob,y)
        loss.backward()
        optimizer.step()
        if np.mod(i,500)==0:
            batch_size = X.shape[0]
            ppl = np.exp(loss.cpu().detach() / (batch_size * 31)) # update
            acc = torch.sum(torch.argmax(prob.cpu().detach(),dim=1) == y.cpu().detach()).float() / torch.FloatTensor([batch_size*31])
            print('Epoch: %d, Batch: %d, loss: %.4f , Train PPL: %.4f, Train Acc: %.4f' % (e, i, loss.cpu().detach(), ppl, acc))
          

In [32]:
training_loop(tc_net,train_iter,optimizer,criterion,scheduler)

Epoch: 0, Batch: 0, loss: 1242.5107 , Train PPL: 55.0421, Train Acc: 0.1968
Epoch: 0, Batch: 500, loss: 3966.8901 , Train PPL: 360923.0625, Train Acc: 0.1097
Epoch: 0, Batch: 1000, loss: 5550.9492 , Train PPL: 59786364.0000, Train Acc: 0.1129
Epoch: 0, Batch: 1500, loss: 5202.0132 , Train PPL: 19398132.0000, Train Acc: 0.1387
Epoch: 0, Batch: 2000, loss: 4880.7241 , Train PPL: 6880977.5000, Train Acc: 0.1065
Epoch: 0, Batch: 2500, loss: 2915.7388 , Train PPL: 12156.3770, Train Acc: 0.1355


In [59]:
for e in range(20):
    h0 = None
    net.train()
    for i,b in enumerate(train_iter):
        optimizer.zero_grad()
        data = torch.transpose(b.text.values, dim0=0, dim1=1)
        X = data[:,:-1]
        y = data[:,1:]
    
    #X, y = make_batch(b, b_size=b.text.shape['batch'], b_seq_len=b.text.shape['seqlen'], batch_seq_len=10)
    #h0 = hidden[:X.shape[0],:] # ensures h0 has same batch dim as X
        prob, hidden = net(X,h0)
        #h0 = tuple(h.detach() for h in hidden)
        loss = criterion(prob.transpose(1,2),y)
        loss.backward()
        optimizer.step()
    
        if np.mod(i,500)==0:
            batch_size = X.shape[0]
            ppl = np.exp(loss.cpu().detach() / (batch_size * 31)) # update
            acc = torch.sum(torch.argmax(prob.cpu().detach(),dim=2) == y.cpu().detach()).float() / torch.FloatTensor([batch_size*31])
            print('Epoch: %d, Batch: %d, loss: %.4f , Train PPL: %.4f, Train Acc: %.4f' % (e, i, loss.cpu().detach(), ppl, acc))
            #print([p.grad for p in net.parameters()])
    net.eval()
    acc_val = []
    ppl_val = []
    print('Running validation')
    for i,b in enumerate(val_iter):
        data = torch.transpose(b.text.values, dim0=0, dim1=1)
        X = data[:,:-1]
        y = data[:,1:]
        prob, hidden = net(X, h0)
        #h0 = tuple(h.detach() for h in hidden)
        #h0 = [h.data for h in hidden]
        loss = criterion(prob.transpose(1,2),y)
        ppl_val.append(np.exp(loss.cpu().detach() / (batch_size * 31))) # update
        acc_val.append(torch.sum(torch.argmax(prob.cpu().detach(),dim=2) == y.cpu().detach()).float() / torch.FloatTensor([batch_size*31]))
    scheduler.step(torch.mean(torch.stack(ppl_val)))
    print('Epoch: %d, Val PPL: %.4f, Val Acc: %.4f' % (e,torch.mean(torch.stack(ppl_val)), torch.mean(torch.stack(acc_val))))
    

Epoch: 0, Batch: 0, loss: 273250.9062 , Train PPL: inf, Train Acc: 0.0097
Epoch: 0, Batch: 500, loss: 229993.7812 , Train PPL: inf, Train Acc: 0.0065


KeyboardInterrupt: 

In [44]:
# load test set

import pandas as pd
    
def write_predictions(net,output_file):
    sentences = []
    for i, l in enumerate(open("input.txt"), 1):
        sentences.append(re.split(' ', l))
    # make predictions
    predictions = []

    for i in range(len(sentences)):
        s = torch.tensor([TEXT.vocab.stoi[j] for j in sentences[i]]).cuda()
        prob, hidden = net(torch.unsqueeze(s, 0))
        top_idx = torch.squeeze(torch.argsort(prob[:,-1,:], descending=True))[:20]
        l_  = [TEXT.vocab.itos[j] for j in top_idx]
        predictions.append(' '.join(l_))
      
    if i % 100 == 0:
        print(i, '/', len(sentences),end="\r",flush=True)
        
    out = pd.DataFrame(index=range(len(predictions)))
    out.index.names = ['id']
    out['word'] = predictions
    out.to_csv('predictions_model3.txt',sep=',')

In [48]:
train_loss = 0
train_words = 0

for b in iter(train_iter):
  data = torch.transpose(b.text.values, dim0=0, dim1=1)
  X = data[:,:-1]
  y = data[:,1:]
  prob, hidden = net(X)
  train_loss += criterion(prob.transpose(1,2), y).detach()
  train_words += X.shape[0] * X.shape[1]
  
train_ppl = torch.exp(train_loss / train_words)
print(train_ppl)

tensor(408.5605, device='cuda:0')


In [40]:
import pandas as pd


[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0'),
 tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0'),
 tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0'),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
      