In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.data.utils import get_tokenizer

import pandas as pd
import time

import torchtext
from torchtext.data.utils import get_tokenizer

In [2]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        if self.src_mask is None or self.src_mask.size(0) != len(src):
            device = src.device
            mask = self._generate_square_subsequent_mask(len(src)).to(device)
            self.src_mask = mask

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return output

``PositionalEncoding`` module injects some information about the
relative or absolute position of the tokens in the sequence. The
positional encodings have the same dimension as the embeddings so that
the two can be summed. Here, we use ``sine`` and ``cosine`` functions of
different frequencies.




In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

Load and batch data
-------------------




The training process uses Wikitext-2 dataset from ``torchtext``. The
vocab object is built based on the train dataset and is used to numericalize
tokens into tensors. Starting from sequential data, the ``batchify()``
function arranges the dataset into columns, trimming off any tokens remaining
after the data has been divided into batches of size ``batch_size``.
For instance, with the alphabet as the sequence (total length of 26)
and a batch size of 4, we would divide the alphabet into 4 sequences of
length 6:

\begin{align}\begin{bmatrix}
  \text{A} & \text{B} & \text{C} & \ldots & \text{X} & \text{Y} & \text{Z}
  \end{bmatrix}
  \Rightarrow
  \begin{bmatrix}
  \begin{bmatrix}\text{A} \\ \text{B} \\ \text{C} \\ \text{D} \\ \text{E} \\ \text{F}\end{bmatrix} &
  \begin{bmatrix}\text{G} \\ \text{H} \\ \text{I} \\ \text{J} \\ \text{K} \\ \text{L}\end{bmatrix} &
  \begin{bmatrix}\text{M} \\ \text{N} \\ \text{O} \\ \text{P} \\ \text{Q} \\ \text{R}\end{bmatrix} &
  \begin{bmatrix}\text{S} \\ \text{T} \\ \text{U} \\ \text{V} \\ \text{W} \\ \text{X}\end{bmatrix}
  \end{bmatrix}\end{align}

These columns are treated as independent by the model, which means that
the dependence of ``G`` and ``F`` can not be learned, but allows more
efficient batch processing.




# Load Tweets

In [4]:
tweets_path = '/home/ben/data/tweets/training_set_tweets.txt'
OUTPUT_DIR = '/home/ben/data/tweets/'

In [15]:
with open(tweets_path) as f:
    tweets_lines = f.readlines()

_data = []
for n,x in enumerate(tweets_lines):
    try:
        _data.append({'text':x.split('\t')[2]})
    except IndexError:
        pass

df = pd.DataFrame(_data)

df

# # Cut out hashtags

# hashtags = df.text.str.extractall(r'(\#\S*)')

# df['hashtags'] = hashtags.unstack().agg(list, axis=1)

# df.dropna(inplace=True, subset=['hashtags'])

# df.hashtags = df.hashtags.apply(lambda x : [h for h in x if type(h) == str])

# df['hashtags_text'] = df.hashtags.apply(lambda x: ' '.join(x))

# df['text_no_tags'] = df.apply(lambda x: ' '.join([w for w in x.text.split(' ') if w not in x.hashtags]), axis=1)

# df.head(10)

# train_split = 0.9
# train_size = int(len(df) * train_split)
# test_size = int(len(df)-train_size)
# train_df = df[:train_size]
# valid_df = df[train_size:-test_size]
# test_df = df[-test_size:]

# train_df.to_json(OUTPUT_DIR + 'train.json')
# valid_df.to_json(OUTPUT_DIR + 'valid.json')
# test_df.to_json(OUTPUT_DIR + 'test.json')

Unnamed: 0,text
0,@thediscovietnam coo. thanks. just dropped yo...
1,@thediscovietnam shit it ain't lettin me DM yo...
2,"@thediscovietnam hey cody, quick question...ca..."
3,@smokinvinyl dang. you need anything? I got ...
4,"maybe i'm late in the game on this one, but th..."
...,...
3747636,what bees make milk?? ...
3747637,boredd as tits.
3747638,"oh god heroes tonight, ... http://lnk.ms/0ShgX"
3747639,tits i'm bored


In [6]:
train_df = pd.read_json(OUTPUT_DIR+'train.json')

In [7]:
train_df

Unnamed: 0,text,hashtags,hashtags_text,text_no_tags
32,this afghanistan situation is not good. come o...,[#red],#red,this afghanistan situation is not good. come o...
101,@courosa Here's 2 wishing this had been stream...,[#edmedia],#edmedia,@courosa Here's 2 wishing this had been stream...
104,I hope you all enjoy #edmedia day 4. I wish I ...,[#edmedia],#edmedia,I hope you all enjoy day 4. I wish I could be ...
105,Have u been 2 our site yet? Yeah - It's not fl...,"[#highered, #film, #TEACHact, #moodle, #sakai]",#highered #film #TEACHact #moodle #sakai,Have u been 2 our site yet? Yeah - It's not fl...
106,Have you been to our website yet? Yeah - It's ...,"[#highered, #film, #TEACHact, #moodle, #sakai]",#highered #film #TEACHact #moodle #sakai,Have you been to our website yet? Yeah - It's ...
...,...,...,...,...
3220150,@Ronstormer Booo. I gave you 20 bucks to make ...,[#istaysupercooltoocool],#istaysupercooltoocool,@Ronstormer Booo. I gave you 20 bucks to make ...
3220230,"Music Monday: Review: Wheedle&#39;s Groove, Ke...",[#39;s],#39;s,"Music Monday: Review: Wheedle&#39;s Groove, Ke..."
3220234,"What&#39;s Happenin&#39;: September 16 - 30, 2...","[#39;s, #39;:]",#39;s #39;:,"What&#39;s Happenin&#39;: September 16 - 30, 2..."
3220382,#florida #politics - Kenric Ward: The 'need' f...,"[#florida, #politics]",#florida #politics,- Kenric Ward: The 'need' for common ground in...


# Torch Text Dataset

In [12]:
# Fields
TEXT = torchtext.data.Field(tokenize='spacy',
#                             init_token='<sos>',
#                             eos_token='<eos>'
                           )
# HASHTAGS = torchtext.data.Field(tokenize=list,
#                             lower=False,)
fields = {'text_no_tags': ('text', TEXT), 'hashtags_text': ('label', TEXT)}

In [13]:
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = OUTPUT_DIR,
                                        train = 'train.json',
                                        validation = 'valid.json',
                                        test = 'test.json',
                                        format = 'json',
                                        fields = fields
)

In [14]:
# train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
# HASHTAGS.build_vocab(train_data, 
#                  max_size = 25_000, 
#                  vectors = "glove.twitter.27B.200d", 
#                  unk_init = torch.Tensor.normal_)
TEXT.build_vocab(train_data, 
#                  max_size = 25_000, 
#                  vectors = "glove.twitter.27B.200d", 
#                  unk_init = torch.Tensor.normal_
                )

TypeError: '<' not supported between instances of 'str' and 'int'

In [31]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # BATCH_SIZE = 50

# # train_iterator, valid_iterator, test_iterator = data.BPTTIterator.splits(
# #     (train_data, valid_data, test_data), 
# #     batch_size = BATCH_SIZE,
# #     sort_key=lambda x: len(x.text),
# #     sort_within_batch = True,
# #     device = device,
# #     bptt_len=10)

In [14]:
def batchify(data, bsz):
    X = TEXT.numericalize([data.examples[0].text])
    # Divide the dataset into bsz parts.
    nbatch = X.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    X = X.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    X = X.view(bsz, -1).t().contiguous()
    
    
    y = TEXT.numericalize([data.examples[0].label])
    # Divide the dataset into bsz parts.
    nbatch = y.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    y = y.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    y = y.view(bsz, -1).t().contiguous()
    
    return (X.to(device),y.to(device))

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_data, batch_size)
val_data = batchify(valid_data, eval_batch_size)
test_data = batchify(test_data, eval_batch_size)


AttributeError: 'Field' object has no attribute 'vocab'

Functions to generate input and target sequence
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~




``get_batch()`` function generates the input and target sequence for
the transformer model. It subdivides the source data into chunks of
length ``bptt``. For the language modeling task, the model needs the
following words as ``Target``. For example, with a ``bptt`` value of 2,
we’d get the following two Variables for ``i`` = 0:

![](../_static/img/transformer_input_target.png)


It should be noted that the chunks are along dimension 0, consistent
with the ``S`` dimension in the Transformer model. The batch dimension
``N`` is along dimension 1.




In [39]:
bptt = 35
def get_batch(source, i):
    seq_len = min(bptt, len(source[0]) - 1 - i)
    data = source[0][i:i+seq_len]
    target = source[1][i:i+seq_len].view(-1)
#     target = source[i+1:i+1+seq_len].view(-1)
    return data, target

Initiate an instance
--------------------




The model is set up with the hyperparameter below. The vocab size is
equal to the length of the vocab object.




In [45]:
ntokens = len(TEXT.vocab.stoi) # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models+ len(HASHTAGS.vocab.stoi)
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

Run the model
-------------




`CrossEntropyLoss <https://pytorch.org/docs/master/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss>`__
is applied to track the loss and
`SGD <https://pytorch.org/docs/master/optim.html?highlight=sgd#torch.optim.SGD>`__
implements stochastic gradient descent method as the optimizer. The initial
learning rate is set to 5.0. `StepLR <https://pytorch.org/docs/master/optim.html?highlight=steplr#torch.optim.lr_scheduler.StepLR>`__ is
applied to adjust the learn rate through epochs. During the
training, we use
`nn.utils.clip_grad_norm\_ <https://pytorch.org/docs/master/nn.html?highlight=nn%20utils%20clip_grad_norm#torch.nn.utils.clip_grad_norm_>`__
function to scale all the gradient together to prevent exploding.




In [46]:
criterion = nn.CrossEntropyLoss()
lr = 5.0 # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train():
    model.train() # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data[0].size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        log_interval = 200
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval() # Turn on the evaluation mode
    total_loss = 0.
    ntokens = len(TEXT.vocab.stoi)
    with torch.no_grad():
        for i in range(0, data_source[0].size(0) - 1, bptt):
            data, targets = get_batch(data_source[0], i)
            output = eval_model(data)
            print(output)
            print(targets)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source[0]) - 1)

Loop over epochs. Save the model if the validation loss is the best
we've seen so far. Adjust the learning rate after each epoch.



In [47]:
best_val_loss = float("inf")
epochs = 3 # The number of epochs
best_model = None

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

    scheduler.step()

-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  3.30s | valid loss -0.00 | valid ppl     1.00
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  2.90s | valid loss -0.00 | valid ppl     1.00
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  2.90s | valid loss -0.00 | valid ppl     1.00
-----------------------------------------------------------------------------------------


In [48]:
model.eval() # Turn on the evaluation mode
total_loss = 0.
ntokens = len(TEXT.vocab.stoi)

In [49]:
type(train_data[0])

torch.Tensor

In [51]:
with torch.no_grad():
    for i in range(0, test_data[0].size(0) - 1, bptt):
        print(i)
        data, targets = get_batch(test_data[0], i)
        output = model(data)
        print(targets)
#         output_flat = output.view(-1, ntokens)
#         total_loss += len(data) * criterion(output_flat, targets).item()
# # return total_loss / (len(data_source) - 1)

0
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')
35


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0, 100] because the unspecified dimension size -1 can be any value and is ambiguous

Evaluate the model with the test dataset
-------------------------------------

Apply the best model to check the result with the test dataset.



In [245]:
test_loss = evaluate(best_model, test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)



OverflowError: math range error