In [1]:
import torch
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torch.nn as nn
import torch.nn.functional as F

from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.data import BucketIterator

import random #teacher forcing

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
PROJECT_NAME = 'cassandra'
BATCH_SIZE = 64

In [4]:
#set up fields
BODY = Field()
NAME = Field()

fields = {'name': ('name', NAME), 'body': ('body', BODY)}

In [5]:
#get data from json
train, test = TabularDataset.splits(
                path = 'data',
                train = f'{PROJECT_NAME}_train.json',
                test = f'{PROJECT_NAME}_test.json',
                format = 'json',
                fields = fields
)

In [6]:
print('train.fields', train.fields)
print('len(train)', len(train))

train.fields {'name': <torchtext.data.field.Field object at 0x7f9e631b8438>, 'body': <torchtext.data.field.Field object at 0x7f9e631b8940>}
len(train) 11490


In [7]:
BODY.build_vocab(train.body, train.name)
NAME.build_vocab(train.body, train.name)

In [8]:
print(len(BODY.vocab))
print(len(NAME.vocab))

12236
12236


In [9]:
print(BODY.vocab.freqs.most_common(10))

[('(', 94167), (')', 94167), ('.', 67730), (';', 55698), (',', 46923), ('{', 20473), ('}', 20473), ('=', 18020), ('get', 11897), ('<sentence_start>', 11490)]


In [10]:
# make iterator for splits
train_iter, test_iter = BucketIterator.splits(
    (train, test), 
    batch_size=BATCH_SIZE, 
    sort_key=lambda x: len(x.name),
    repeat=False)

In [11]:
class AttentionFeatures(nn.Module):
    """
    Page 3 of the paper
    attention_features (code tokens c, context h_{t-1})
     C <- lookupandpad(c, E)
     L1 <- ReLU(Conv1d(C, K_{l1}))
     L2 <- Conv1d(L1, K_{l2}) * h_{t-1}
     Lfeat <- L2/||L2||_2
     return Lfeat
    """
    def __init__(self, embedding_dim, k1, w1, k2, w2, w3, dropout, prelu):
        super(AttentionFeatures, self).__init__()
                
        self.w1 = w1
        self.k1 = k1

        self.w2 = w2
        self.k2 = k2

        #self.w3 = w3 #use this to calculate padding

        self.conv1 = nn.Conv1d(embedding_dim, k1, w1)
        self.conv2 = nn.Conv1d(k1, k2, w2)
        self.do = nn.Dropout(dropout)
        self.relu = nn.PReLU() if prelu == True else F.relu

    def forward(self, C, h_t):
        
        #C = embedded body tokens
        #h_t = previous hidden state used to predict name token
        
        #C = [bodies len, batch size, emb dim]
        #h_t = [1, batch size, k2]
        
        C = C.permute(1, 2, 0) #input to conv needs n_channels as dim 1
        
        #C = [batch size, emb dim, bodies len]
        
        h_t = h_t.permute(1, 2, 0) #from [1, batch size, k2] to [batch size, k2, 1]
        
        #h_t = [batch size, k2, 1]
        
        L_1 = self.do(self.relu(self.conv1(C)))
        
        #L_1 = [batch size, k1, bodies len - w1 + 1]
        
        L_2 = self.do(self.conv2(L_1)) * h_t
                
        #L_2 = [batch size, k2, bodies len - w1 - w2 + 2]
        
        L_feat = F.normalize(L_2, p=2, dim=1)
                
        #L_feat = [batch size, k2, bodies len - w1 - w2 + 2]
                
        return L_feat

In [12]:
class AttentionWeights(nn.Module):
    """
    Page 3 of the paper
    attention_features (attention features Lfeat, kernel K)
     return Softmax(Conv1d(Lfeat, K))
    """
    def __init__(self, k2, w3, dropout):
        super(AttentionWeights, self).__init__()

        self.conv1 = nn.Conv1d(k2, 1, w3)
        self.do = nn.Dropout(dropout)

    def forward(self, L_feat):
                
        #L_feat = [batch size, k2, bodies len - w1 - w2 + 2]
        
        x = self.do(self.conv1(L_feat))
        
        #x = [batch size, 1, bodies len - w1 - w2 - w3 + 3]
        
        x = x.squeeze(1)
        
        #x = [batch size, bodies len - w1 - w2 - w3 + 3]
        
        x = F.softmax(x, dim=1)
                
        #x = [batch size, bodies len - w1 - w2 - w3 + 3]
                
        return x

In [13]:
class ConvAttentionNetwork(nn.Module):
    def __init__(self, vocab_size, embedding_dim, k1, k2, w1, w2, w3, dropout, prelu, pad_idx):
        super(ConvAttentionNetwork, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.k1 = k1
        self.k2 = k2
        self.w1 = w1
        self.w2 = w2
        self.w3 = w3
        self.dropout = dropout
        self.prelu = prelu
        self.pad_idx = pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.do = nn.Dropout(dropout)
        self.gru = nn.GRU(embedding_dim, k2)
        self.attn_feat = AttentionFeatures(embedding_dim, k1, w1, k2, w2, w3, dropout, prelu)
        self.attn_weights = AttentionWeights(k2, w3, dropout)
        self.bias = nn.Parameter(torch.ones(vocab_size))
        
        n_padding = w1 + w2 + w3 - 3
        self.padding = torch.zeros(n_padding, 1).fill_(pad_idx).long()
        
    def forward(self, bodies, names):
        
        #bodies = [bodies len, batch size]
        #names = [names len, batch size]  
        
        #stores the probabilities generated for each token
        outputs = torch.zeros(names.shape[0], names.shape[1], self.vocab_size).to(names.device)
        
        #outputs = [name len, batch size, vocab dim]
        
        bodies_padded = torch.cat((bodies, self.padding.expand(-1, bodies.shape[1]).to(bodies.device)))
        
        #bodies_padded = [bodies len + w1 + w2 + w3 - 3, batch_size]
        
        emb_b = self.embedding(bodies_padded)
        
        #emb_b = [bodies len, batch size, emb dim]
                
        #first input to gru is <sos> token
        output = names[0]
            
        for i in range(1, names.shape[0]):
            
            #initial hidden state is rnn applied to the <sos> token
            _, h_t = self.gru(self.embedding(output).unsqueeze(0))

            #h_t = [1, batch size, k2]

            L_feat = self.attn_feat(emb_b, h_t)

            #L_feat = [batch size, k2, bodies len - w1 - w2 + 2]

            alpha = self.attn_weights(L_feat)

            #alpha = [batch size, bodies len - w1 - w2 - w3 + 3]

            emb_b_slice = emb_b.permute(1, 0, 2)[:, :bodies.shape[0], :]

            #emb_b = [batch_size, bodies len, emb dim]

            n_hat = torch.sum(alpha.unsqueeze(2) * emb_b_slice, dim=1)

            #n_hat = [batch size, emb dim]

            E = self.embedding.weight.unsqueeze(0).expand(bodies.shape[1],-1,-1)

            #E = [batch size, vocab size, emb dim]

            n = torch.bmm(E, n_hat.unsqueeze(2)).squeeze(2) + self.bias.unsqueeze(0).expand(bodies.shape[1], -1)
            
            #n = [batch size, vocab size]
            
            outputs[i] = n
            
            #teacher forcing ratio is equal to dropout
            if random.random() < self.dropout:
                
                top1 = n.max(1)[1]
                output = top1
                
            else:
                output = names[i]
                
        return outputs

In [14]:
VOCAB_SIZE = len(BODY.vocab)
EMBEDDING_DIM = 128
K1 = 8
K2 = 8
W1 = 24
W2 = 29
W3 = 10
DROPOUT = 0.5
PRELU = True
PAD_IDX = BODY.vocab.stoi['<pad>']

model = ConvAttentionNetwork(VOCAB_SIZE, EMBEDDING_DIM, K1, K2, W1, W2, W3, DROPOUT, PRELU, PAD_IDX)

model = model.to(device)

In [15]:
#initialize optimizer, scheduler and loss function
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters())

In [16]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        bodies = batch.body
        names = batch.name
        
        optimizer.zero_grad()
        
        output = model(bodies, names)
        
        loss = criterion(output[1:].view(-1, output.shape[2]), names[1:].view(-1))
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [17]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            bodies = batch.body
            names = batch.name

            output = model(bodies, names)

            loss = criterion(output[1:].view(-1, output.shape[2]), names[1:].view(-1))

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [18]:
import math
import os
N_EPOCHS = 100
CLIP = 10

best_test_loss = float('inf')

if not os.path.isdir('.save'):
    os.makedirs('.save')

for epoch in range(N_EPOCHS):
    
    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    test_loss = evaluate(model, test_iter, criterion)
    
    if test_loss < best_test_loss:
        best_test_loss = test_loss
        torch.save(model.state_dict(), '.save/model.pt')
    
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):8.2f} | Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):8.2f} |')

  return Variable(arr, volatile=not train)


| Epoch: 01 | Train Loss: 9.204 | Train PPL:  9933.65 | Test Loss: 8.354 | Test PPL:  4246.63 |
| Epoch: 02 | Train Loss: 8.399 | Train PPL:  4442.23 | Test Loss: 7.298 | Test PPL:  1477.10 |
| Epoch: 03 | Train Loss: 7.812 | Train PPL:  2470.23 | Test Loss: 7.042 | Test PPL:  1144.12 |
| Epoch: 04 | Train Loss: 7.349 | Train PPL:  1554.21 | Test Loss: 6.719 | Test PPL:   828.37 |
| Epoch: 05 | Train Loss: 7.007 | Train PPL:  1104.33 | Test Loss: 6.518 | Test PPL:   677.49 |
| Epoch: 06 | Train Loss: 6.705 | Train PPL:   816.36 | Test Loss: 6.202 | Test PPL:   493.71 |
| Epoch: 07 | Train Loss: 6.471 | Train PPL:   646.36 | Test Loss: 6.218 | Test PPL:   501.84 |
| Epoch: 08 | Train Loss: 6.217 | Train PPL:   501.24 | Test Loss: 5.735 | Test PPL:   309.63 |
| Epoch: 09 | Train Loss: 6.068 | Train PPL:   431.70 | Test Loss: 6.565 | Test PPL:   709.96 |
| Epoch: 10 | Train Loss: 5.927 | Train PPL:   375.03 | Test Loss: 5.140 | Test PPL:   170.72 |
| Epoch: 11 | Train Loss: 5.741 | Train 

| Epoch: 87 | Train Loss: 3.149 | Train PPL:    23.31 | Test Loss: 3.304 | Test PPL:    27.21 |
| Epoch: 88 | Train Loss: 3.144 | Train PPL:    23.19 | Test Loss: 3.306 | Test PPL:    27.28 |
| Epoch: 89 | Train Loss: 3.114 | Train PPL:    22.51 | Test Loss: 3.301 | Test PPL:    27.14 |
| Epoch: 90 | Train Loss: 3.097 | Train PPL:    22.13 | Test Loss: 3.289 | Test PPL:    26.81 |
| Epoch: 91 | Train Loss: 3.108 | Train PPL:    22.37 | Test Loss: 3.292 | Test PPL:    26.88 |
| Epoch: 92 | Train Loss: 3.096 | Train PPL:    22.11 | Test Loss: 3.260 | Test PPL:    26.05 |
| Epoch: 93 | Train Loss: 3.076 | Train PPL:    21.67 | Test Loss: 3.286 | Test PPL:    26.72 |
| Epoch: 94 | Train Loss: 3.075 | Train PPL:    21.64 | Test Loss: 3.266 | Test PPL:    26.21 |
| Epoch: 95 | Train Loss: 3.045 | Train PPL:    21.01 | Test Loss: 3.264 | Test PPL:    26.16 |
| Epoch: 96 | Train Loss: 3.050 | Train PPL:    21.11 | Test Loss: 3.230 | Test PPL:    25.29 |
| Epoch: 97 | Train Loss: 3.027 | Train 