In [1]:
import torch
from torchtext import data

SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField()

In [2]:
from torchtext import datasets

train_data, valid_data, test_data = datasets.SNLI.splits(TEXT, LABEL)

In [3]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 549367
Number of validation examples: 9842
Number of testing examples: 9824


In [4]:
print(vars(train_data.examples[0]))

{'premise': ['A', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken', 'down', 'airplane', '.'], 'hypothesis': ['A', 'person', 'is', 'training', 'his', 'horse', 'for', 'a', 'competition', '.'], 'label': 'neutral'}


In [5]:
MIN_FREQ = 2

TEXT.build_vocab(train_data, 
                 min_freq = MIN_FREQ,
                 vectors = "glove.840B.300d",
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

In [6]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")

Unique tokens in TEXT vocabulary: 27587


In [7]:
print(TEXT.vocab.freqs.most_common(20))

[('.', 962558), ('a', 922222), ('A', 516769), ('in', 406005), ('is', 373433), ('the', 369849), ('man', 256428), ('on', 235198), ('and', 206278), ('are', 199010), ('of', 192390), ('with', 169013), ('The', 164283), ('woman', 133181), (',', 114331), ('to', 113904), ('at', 97993), ('people', 95937), ('Two', 90409), ('wearing', 80966)]


In [8]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', '.', 'a', 'A', 'in', 'is', 'the', 'man', 'on']


In [9]:
print(LABEL.vocab.itos)

['entailment', 'contradiction', 'neutral']


In [10]:
print(LABEL.vocab.freqs.most_common())

[('entailment', 183416), ('contradiction', 183187), ('neutral', 182764)]


In [11]:
BATCH_SIZE = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)

In [12]:
import torch.nn as nn
import torch.nn.functional as F

class NLISum(nn.Module):
    def __init__(self, 
                 vocab_size, 
                 embedding_dim,
                 hidden_dim,
                 fc_layers,
                 output_dim, 
                 dropout, 
                 pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.translation = nn.Linear(embedding_dim, hidden_dim)
        
        fcs = [nn.Linear(hidden_dim * 2, hidden_dim * 2) for _ in range(fc_layers)]
        
        self.fcs = nn.ModuleList(fcs)
        
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, prem, hypo):

        #prem = [prem sent len, batch size]
        #hypo = [hypo sent len, batch size]
        
        embedded_prem = self.embedding(prem)
        embedded_hypo = self.embedding(hypo)
        
        #embedded_premise = [prem sent len, batch size, embedding dim]
        #embedded_hypothesis = [hypo sent len, batch size, embedding dim]
        
        translated_prem = F.relu(self.translation(embedded_prem))
        translated_hypo = F.relu(self.translation(embedded_hypo))
        
        #embedded_premise = [prem sent len, batch size, hidden dim]
        #embedded_hypothesis = [hypo sent len, batch size, hidden dim]
        
        hidden_prem = translated_prem.sum(dim=0)
        hidden_hypo = translated_hypo.sum(dim=0)
            
        #hidden_x = [batch size, hid dim]
        
        hidden = torch.cat((hidden_prem, hidden_hypo), dim=1)

        #hidden = [batch size, hid dim * 2]
            
        for fc in self.fcs:
            hidden = fc(hidden)
            hidden = F.relu(hidden)
            hidden = self.dropout(hidden)
        
        prediction = self.fc_out(hidden)
        
        #prediction = [batch size, output dim]
        
        return prediction

In [13]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 300
FC_LAYERS = 3
OUTPUT_DIM = len(LABEL.vocab)
DROPOUT = 0.25
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = NLISum(INPUT_DIM,
               EMBEDDING_DIM,
               HIDDEN_DIM,
               FC_LAYERS,
               OUTPUT_DIM,
               DROPOUT,
               PAD_IDX)

In [14]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.normal_(param.data, mean=0, std=0.1)
        
model.apply(init_weights)

SNLI(
  (embedding): Embedding(27587, 300, padding_idx=1)
  (translation): Linear(in_features=300, out_features=300, bias=True)
  (fcs): ModuleList(
    (0): Linear(in_features=600, out_features=600, bias=True)
    (1): Linear(in_features=600, out_features=600, bias=True)
    (2): Linear(in_features=600, out_features=600, bias=True)
  )
  (fc_out): Linear(in_features=600, out_features=3, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,450,003 trainable parameters


In [16]:
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

torch.Size([27587, 300])


In [17]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ..., -1.4447,  0.8402, -0.8668],
        [ 0.1032, -1.6268,  0.5729,  ...,  0.3180, -0.1626, -0.0417],
        [ 0.0120,  0.2075, -0.1258,  ...,  0.1387, -0.3605, -0.0350],
        ...,
        [ 0.3759, -0.1128,  0.3870,  ...,  0.3146,  0.1263,  0.9477],
        [ 0.6011,  0.4813,  0.3344,  ...,  0.2632,  0.5857, -0.2197],
        [ 0.0667, -0.1165,  0.0910,  ..., -0.0081, -0.1964, -0.2732]])

In [18]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0120,  0.2075, -0.1258,  ...,  0.1387, -0.3605, -0.0350],
        ...,
        [ 0.3759, -0.1128,  0.3870,  ...,  0.3146,  0.1263,  0.9477],
        [ 0.6011,  0.4813,  0.3344,  ...,  0.2632,  0.5857, -0.2197],
        [ 0.0667, -0.1165,  0.0910,  ..., -0.0081, -0.1964, -0.2732]])


In [19]:
model.embedding.weight.requires_grad = False

In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,173,903 trainable parameters


In [21]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [22]:
criterion = nn.CrossEntropyLoss()

In [23]:
model = model.to(device)
criterion = criterion.to(device)

In [24]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]])

In [25]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        prem = batch.premise
        hypo = batch.hypothesis
        labels = batch.label
        
        optimizer.zero_grad()
        
        #prem = [prem sent len, batch size]
        #hypo = [hypo sent len, batch size]
        
        predictions = model(prem, hypo)
        
        #predictions = [batch size, output dim]
        #labels = [batch size]
        
        loss = criterion(predictions, labels)
                
        acc = categorical_accuracy(predictions, labels)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [26]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            prem = batch.premise
            hypo = batch.hypothesis
            labels = batch.label
                        
            predictions = model(prem, hypo)
            
            loss = criterion(predictions, labels)
                
            acc = categorical_accuracy(predictions, labels)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [27]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [28]:
N_EPOCHS = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 12s
	Train Loss: 1.855 | Train Acc: 42.95%
	 Val. Loss: 0.973 |  Val. Acc: 52.50%
Epoch: 02 | Epoch Time: 0m 12s
	Train Loss: 0.933 | Train Acc: 55.26%
	 Val. Loss: 0.874 |  Val. Acc: 61.28%
Epoch: 03 | Epoch Time: 0m 11s
	Train Loss: 0.880 | Train Acc: 59.04%
	 Val. Loss: 0.852 |  Val. Acc: 63.60%
Epoch: 04 | Epoch Time: 0m 11s
	Train Loss: 0.844 | Train Acc: 61.62%
	 Val. Loss: 0.821 |  Val. Acc: 65.66%
Epoch: 05 | Epoch Time: 0m 11s
	Train Loss: 0.816 | Train Acc: 63.49%
	 Val. Loss: 0.807 |  Val. Acc: 65.35%
Epoch: 06 | Epoch Time: 0m 12s
	Train Loss: 0.790 | Train Acc: 65.28%
	 Val. Loss: 0.779 |  Val. Acc: 67.79%
Epoch: 07 | Epoch Time: 0m 11s
	Train Loss: 0.762 | Train Acc: 67.09%
	 Val. Loss: 0.762 |  Val. Acc: 68.23%
Epoch: 08 | Epoch Time: 0m 11s
	Train Loss: 0.740 | Train Acc: 68.37%
	 Val. Loss: 0.731 |  Val. Acc: 69.94%
Epoch: 09 | Epoch Time: 0m 11s
	Train Loss: 0.724 | Train Acc: 69.22%
	 Val. Loss: 0.713 |  Val. Acc: 70.50%
Epoch: 10 | Epoch T

In [29]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.687 |  Test Acc: 72.21%


In [30]:
class NLIRNN(nn.Module):
    def __init__(self, 
                 vocab_size, 
                 encode_method,
                 embedding_dim,
                 hidden_dim,
                 fc_layers,
                 output_dim, 
                 dropout, 
                 pad_idx):
        
        super().__init__()
                
        assert encode_method in {'gru', 'lstm'}
        
        self.encode_method = encode_method
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.translation = nn.Linear(embedding_dim, hidden_dim)
        
        if encode_method == 'gru':
            self.rnn = nn.GRU(hidden_dim, hidden_dim)
        elif encode_method == 'lstm':
            self.rnn = nn.LSTM(hidden_dim, hidden_dim)
        
        fcs = [nn.Linear(hidden_dim * 2, hidden_dim * 2) for _ in range(fc_layers)]
        
        self.fcs = nn.ModuleList(fcs)
        
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, prem, hypo):

        #prem = [prem sent len, batch size]
        #hypo = [hypo sent len, batch size]
        
        embedded_prem = self.embedding(prem)
        embedded_hypo = self.embedding(hypo)
        
        #embedded_premise = [prem sent len, batch size, embedding dim]
        #embedded_hypothesis = [hypo sent len, batch size, embedding dim]
        
        translated_prem = F.relu(self.translation(embedded_prem))
        translated_hypo = F.relu(self.translation(embedded_hypo))
        
        #embedded_premise = [prem sent len, batch size, hidden dim]
        #embedded_hypothesis = [hypo sent len, batch size, hidden dim]
        
        if self.encode_method == 'gru':
            
            outputs_prem, hidden_prem = self.rnn(translated_prem)
            outputs_hypo, hidden_hypo = self.rnn(translated_hypo)
            
            #outputs_x = [sent len, batch size, hid dim]
            #hidden_x = [1, batch size, hid dim]
            
            hidden_prem = hidden_prem.squeeze(0)
            hidden_hypo = hidden_hypo.squeeze(0)
            
            #hidden_x = [batch size, hid dim]
        
        else:
            
            outputs_prem, (hidden_prem, cell_prem) = self.rnn(translated_prem)
            outputs_hypo, (hidden_hypo, cell_hypo) = self.rnn(translated_hypo)
            
            #outputs_x = [sent len, batch size, hid dim]
            #hidden_x = [1, batch size, hid dim]
            #cell_x = [1, batch size, hid dim]
            
            hidden_prem = hidden_prem.squeeze(0)
            hidden_hypo = hidden_hypo.squeeze(0)
            
            #hidden_x = [batch size, hid dim]

        hidden = torch.cat((hidden_prem, hidden_hypo), dim=1)

        #hidden = [batch size, hid dim * 2]
            
        for fc in self.fcs:
            hidden = fc(hidden)
            hidden = F.relu(hidden)
            hidden = self.dropout(hidden)
        
        prediction = self.fc_out(hidden)
        
        #prediction = [batch size, output dim]
        
        return prediction