In [2]:
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Dirichlet
from torch import LongTensor
import torch
import torch.nn.functional as F
import sys
import re
import math
from collections import Counter
from tqdm import tqdm
from itertools import islice
import numpy as np
from torch.autograd import Variable
import pandas as pd

from torchtext import data
from torchtext import datasets
import torchtext
from torchtext.vocab import Vectors
from torchtext.data.iterator import BPTTIterator, Iterator, BucketIterator
from torchtext.data import Batch, Dataset, Field
from torch.utils.data import DataLoader
from namedtensor import ntorch
from namedtensor.text import NamedField
import os

In [3]:
def get_batch(batch):
    data = batch.text.transpose(0,1)
    X,y = data[:,:-1],data[:,1:]
    return X,y

In [4]:
TEXT = Field()
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path=".", 
    train="train.txt", validation="valid.txt", test="valid.txt", text_field=TEXT)

# Data distributed with the assignment
TEXT.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))

len(TEXT.vocab) 10001


In [5]:
seqlen = 80
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=24, device='cuda', bptt_len=seqlen, repeat=False)

In [6]:
it = iter(train_iter)
batch = next(it) 

In [7]:
print("Size of text batch [max bptt length, batch size]", batch.text.size(), file=sys.stderr)
print("Second in batch", batch.text[:, 2], file=sys.stderr)
print("Converted back to string: ", " ".join([TEXT.vocab.itos[i] for i in batch.text[:, 2].data]), file=sys.stderr)
batch = next(it)
print("Converted back to string: ", " ".join([TEXT.vocab.itos[i] for i in batch.text[:, 2].data]), file=sys.stderr)

Size of text batch [max bptt length, batch size] torch.Size([80, 24])
Second in batch tensor([   0, 5456,    3, 4982,   97,  956,    7, 6420,    0, 5447,    0,  750,
          28,  116,   12,    0, 2278,    0, 4821,   10,   81,    6,   13,    4,
           7,   63,   37,   13,    4,   22,   21,   13,    4,    7,   63,   37,
          13,    4,   22,   79,  205,  288,   16,   15,   43,   26, 1575,    6,
           0, 4982,   10, 1068,  209,    3,    2,   62,  288,   25,   33, 2488,
           3, 4982,   10,  142, 4124,    0,   16,    2,  496,  784,    5, 1320,
        1273,    6, 4982,    3,    8,  410,   19,    0], device='cuda:0')
Converted back to string:  <unk> meridian <eos> ratners group plc a fast-growing <unk> london-based <unk> raised its price for <unk> specialty <unk> weisfield 's inc. to $ N a share or $ N million from $ N a share or $ N million after another concern said it would be prepared to <unk> ratners 's initial offer <eos> the other concern was n't identified <eos> 

In [8]:
# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url)) 
word2vec = TEXT.vocab.vectors


In [9]:
filters = word2vec.shape[1]
embed_size = word2vec.shape[1]
n_words = word2vec.shape[0]

# Model code

In [10]:
LSM = nn.LogSoftmax(dim=2)
from torch.nn.utils import weight_norm
### adapted from https://github.com/locuslab/TCN/ ###
class Chomp1d(torch.nn.Module):
    '''Ensure causal convolutions by removing right most items'''
    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size
    def forward(self, x):
        return x[:, :, :-self.chomp_size].contiguous()
    
    
class TC_block(torch.nn.Module):
    def __init__(self, n_in, n_out, kernel, stride, dilation, padding, dropout=0.2):
        super(TC_block, self).__init__()
        self.conv1 = weight_norm(torch.nn.Conv1d(n_in, n_out, kernel,stride=stride,
                                             padding=padding, dilation=dilation))
      
        self.conv2 = weight_norm(nn.Conv1d(n_out, n_out, kernel,stride=stride,
                                           padding=padding, dilation=dilation))
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
    
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
    
        self.chomp1 = Chomp1d(padding)
        self.chomp2 = Chomp1d(padding)
    
        self.block = torch.nn.Sequential(self.conv1,self.chomp1,self.relu1,self.dropout1,
                                          self.conv2,self.chomp2,self.relu2,self.dropout2)
        self.relu = nn.ReLU()
    
        if n_in != n_out:
            self.conv_re = nn.Conv1d(n_in,n_out,kernel_size=1,stride=1,padding=0)
    def forward(self, x):
        out = self.block(x)
        # skip connection
        if x.shape[1]!=out.shape[1]:
            x = self.conv_re(x)
        return self.relu(out + x)
      
      
      
class TCN(torch.nn.Module):
    def __init__(self, n_layers, n_filters, kernel=2, dropout=0.2, embedding_size = 1000, n_words = 10001,
                tied=True,embedding = None):  
        super(TCN, self).__init__()
        blocks = []
        self.embedding_size = embedding_size
        self.n_words = n_words
        if embedding is None:
            self.embedding = nn.Embedding(self.n_words,self.embedding_size)
        else:
            self.embedding = nn.Embedding(self.n_words,self.embedding_size)
            self.embedding.data = embedding
        self.n_filters = [self.embedding_size] + n_filters
        
                
        for i in range(1,n_layers):
            dilation = 2 ** i
            n_in = self.n_filters[i-1]
            n_out = self.n_filters[i]
            blocks.append(TC_block(n_in, n_out, kernel, stride=1, dilation=dilation, padding=(kernel-1) * dilation, dropout=dropout))
            
        self.network = nn.Sequential(*blocks)
        self.receptive_field = 1 + 2*(kernel-1)*(2 ** n_layers-1) + 1
        self.output_layer = nn.Linear(n_filters[-1], n_words)
        self.relu = nn.ReLU()
        if tied:
            self.output_layer.weight = self.embedding.weight
        self.drop = nn.Dropout(0.25)
        self.init_weights()
    def init_weights(self):
        self.embedding.weight.data.normal_(0, 0.01)
        self.output_layer.bias.data.fill_(0)
        self.output_layer.weight.data.normal_(0, 0.01)

    def forward(self, x):
        embed = self.drop(self.embedding(x))
        hook = self.network(embed.transpose(1,2)).transpose(1,2)
        return self.output_layer(hook)

# Build model and optimizers

In [22]:
model = TCN(5, [600,600,600,600], kernel=2, dropout=0.5, embedding_size = 600, n_words = n_words,tied=True)
model.cuda()
model_parameters = filter(lambda x: x.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print('Network with {} parameters'.format(params))
print('Receptive field of network is {}'.format(model.receptive_field))
criterion = torch.nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,mode="min",patience=5,min_lr=1e-6,factor=0.5)


Network with 11780201 parameters
Receptive field of network is 64


# Training loop

In [23]:
def train_loop(e=0):
    model.train()
    batch_idx = 0
    for batch in train_iter:
        X,y = get_batch(batch)
        prob = model(X)
        # skip some chars for loss
        skip = int(X.shape[1]/2)
        target = y[:, skip:].contiguous()
        output = prob[:, skip:,:].contiguous().transpose(1,2)
        loss = criterion(output, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.35)
        optimizer.step()
        batch_idx +=1
        if np.mod(batch_idx,100) == 0:
            batch_size = X.shape[0]
            ppl = np.exp(loss.cpu().detach() / (batch_size * (seqlen-skip))) # update
            acc = torch.sum(torch.argmax(prob.cpu().detach(),dim=2) == y.cpu().detach()).float() / torch.FloatTensor([batch_size*(seqlen-skip)])
            print('Epoch: %d, Batch: %d, loss: %.4f , Train PPL: %.4f, Train Acc: %.4f' % (e, batch_idx, loss.cpu().detach(), ppl, acc))



# Validation loop

In [24]:
def validation_loop(e=0):
    model.eval()
    batch_idx = 0
    ppl = 0
    acc = 0
    total_loss = 0
    for batch in val_iter:
        X,y = get_batch(batch)
        prob = model(X)
        # skip some chars for loss
        skip = int(X.shape[1]/2)
        target = y[:, skip:].contiguous()
        output = prob[:, skip:,:].contiguous().transpose(1,2)
        total_loss += criterion(output, target).cpu().detach()
        batch_idx +=1
        batch_size = X.shape[0]
        ppl += np.exp(total_loss.cpu().detach() / (batch_size * (seqlen-skip))) # update
        acc += torch.sum(torch.argmax(prob.cpu().detach(),dim=2) == y.cpu().detach()).float() / torch.FloatTensor([batch_size*(seqlen-skip)])
    print('Validation --- Epoch: %d, total loss: %.4f , PPL: %.4f, Acc: %.4f' % (e, total_loss.cpu().detach(), ppl/batch_idx, acc/batch_idx))
    return total_loss

In [None]:
best_vloss = 1e8
for e in range(100):
    train_loop(e)
    validation_loss = validation_loop(e)
    scheduler.step(validation_loss)
    print('lr = {}'.format(optimizer.state_dict()['param_groups'][0]['lr']))
    if validation_loss < best_vloss:
                with open("model_5_layers.pt", 'wb') as f:
                    print('wrote model')
                    torch.save(model, f)
                best_vloss = validation_loss

Epoch: 0, Batch: 100, loss: 6.7071 , Train PPL: 1.0068, Train Acc: 0.1331


In [43]:
model_saved = torch.load('model.pt')

In [46]:
def make_predictions_for_kaggle(model):
    sentences = []
    for i, l in enumerate(open("/home/amaro/cs287/hw2/input.txt"), 1):
        sentences.append(re.split(' ', l))
    tokenized = []
    for s in sentences:
        t_s = []
        for w in s:
            if w != '___\n':
                t_s.append(TEXT.vocab.stoi[w])
        tokenized.append(t_s)
    
    predictions = []
    for i in tokenized:
        X = torch.tensor(i,dtype=torch.long,device='cuda')
        pred = torch.squeeze(model(torch.unsqueeze(X,0)))[-1,:]
        tokens = torch.argsort(pred,descending=True)[:20]
        l_  = [TEXT.vocab.itos[j] for j in tokens]
        predictions.append(' '.join(l_))
    
    out = pd.DataFrame(index=range(1,len(predictions)+1))
    out.index.names = ['id']
    out['word'] = predictions
    out.to_csv('/home/amaro/cs287/hw2/predictions_2_TCN.txt',sep=',')

In [49]:
make_predictions_for_kaggle(model_saved)