In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [128]:
def generate_square_subsequent_mask(sz):
        r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
        """
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [130]:
class TransformerModel(nn.Module):
    """
    
    """
    def __init__(self, d_model, nhead=2, num_decoder_layers=2, num_encoder_layers=2, ntokens=5000):
        super(TransformerModel, self).__init__()
        
        self.num_encoder_layers = num_encoder_layers
        self.num_decoder_layers = num_decoder_layers
        self.nhead = nhead
        self.d_model = d_model
        self.ntokens = ntokens
        self.emb = nn.Embedding(self.ntokens, self.d_model)
        self.posenc = PositionalEncoding(d_model, 0.5)
        self.transformer = nn.Transformer(d_model=self.d_model, nhead=self.nhead,
                                          num_decoder_layers=self.num_decoder_layers,
                                          num_encoder_layers=self.num_encoder_layers)
        self.linear = nn.Linear(self.d_model, self.ntokens)
        
    def forward(self, x, tgt):
        x = self.emb(x) #* math.sqrt(self.d_model)
        tgt = self.emb(tgt)
        #print(x.shape, tgt.shape)
        x = self.posenc(x)
       # print(x)
        x = self.transformer(x, tgt, tgt_mask=generate_square_subsequent_mask(x.shape[0]), src_mask=generate_square_subsequent_mask(x.shape[0]))
        #return x
        return nn.functional.log_softmax(self.linear(x))
        
        
        

### This Part is from https://pytorch.org/tutorials/beginner/transformer_tutorial.html

In [131]:
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
            for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
word_to_ix = {word: i for i, word in enumerate(vocab)}

print(vocab)

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]
{'so', 'To', 'thine', 'it', 'old,', "youth's", 'If', 'much', 'lies,', 'when', 'Were', 'now,', 'small', 'his', 'count,', 'days;', 'an', 'of', 'sum', 'winters', 'in', 'new', 'answer', 'thou', 'Then', 'brow,', 'thy', 'mine', "totter'd", 'the', 'all-eating', 'see', 'lusty', "feel'st", 'old', 'were', 'by', 'This', 'cold.', 'couldst', 'Where', 'make', 'to', 'where', 'trenches', 'more', "excuse,'", 'thriftless', 'thine!', 'use,', 'gazed', 'warm', 'Shall', 'praise.', "beauty's", 'asked,', 'my', 'When', 'besiege', 'field,', 'a', 'made', "'This", 'fair', 'be', 'beauty', 'all', 'How', 'forty', 'proud', 'shame,', 'treasure', 'livery', 'on', 'dig', 'art', 'weed', 'child', 'sunken', 'praise', 'eyes,', 'succession', 'And', 'Thy', "deserv'd", 'shall', 'being', 'own', 'Proving', 'and', 'Will', 'worth', 'held:', 'within', 'blood', 'deep', 'say,'}


In [108]:
import torchtext
from torchtext.data.utils import get_tokenizer

TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)

train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train_txt)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data, bsz):
    data = TEXT.numericalize([data.examples[0].text])
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

batch_size = 16
eval_batch_size = 10
train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)

print(test_txt)

<torchtext.datasets.language_modeling.WikiText2 object at 0x00000272540C1388>


In [109]:
bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [110]:
ntokens = len(TEXT.vocab.stoi)
ntokens

28785

In [112]:
train_data.size()

torch.Size([130419, 16])

In [132]:
model = TransformerModel(d_model=50, nhead=1, num_decoder_layers=1, num_encoder_layers=1, ntokens=ntokens)

In [None]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [125]:
import time
def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data, data)
        print(i)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output = eval_model(data, data)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)

In [None]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()



0
35
70
105
140
175
210
245
280
315
350
385
420
455
490
525
560
595
630
665
700
735
770
805
840
875
910
945
980
1015
1050
1085
1120
1155
1190
1225
1260
1295
1330
1365
1400
1435
1470
1505
1540
1575
1610
1645
1680
1715
1750
1785
1820
1855
1890
1925
1960
1995
2030
2065
2100
2135
2170
2205
2240
2275
2310
2345
2380
2415
2450
2485
2520
2555
2590
2625
2660
2695
2730
2765
2800
2835
2870
2905
2940
2975
3010
3045
3080
3115
3150
3185
3220
3255
3290
3325
3360
3395
3430
3465
3500
3535
3570
3605
3640
3675
3710
3745
3780
3815
3850
3885
3920
3955
3990
4025
4060
4095
4130
4165
4200
4235
4270
4305
4340
4375
4410
4445
4480
4515
4550
4585
4620
4655
4690
4725
4760
4795
4830
4865
4900
4935
4970
5005
5040
5075
5110
5145
5180
5215
5250
5285
5320
5355
5390
5425
5460
5495
5530
5565
5600
5635
5670
5705
5740
5775
5810
5845
5880
5915
5950
5985
6020
6055
6090
6125
6160
6195
6230
6265
6300
6335
6370
6405
6440
6475
6510
6545
6580
6615
6650
6685
6720
6755
6790
6825
6860
6895
6930
6965
7000
| epoch   1 |   200/ 3726 ba

In [126]:
test_loss = evaluate(model, test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)



KeyboardInterrupt: 