In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from collections import Counter
import os
from argparse import Namespace

import pandas as pd

flags = Namespace(
    train_file='oliver.txt',
    seq_size=40,
    batch_size=16,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
)

# Load Tweets

In [2]:
tweets_path = '/home/ben/data/tweets/training_set_tweets.txt'
OUTPUT_DIR = '/home/ben/data/tweets/'

In [3]:
train_df = pd.read_json(OUTPUT_DIR + 'train.json')
valid_df = pd.read_json(OUTPUT_DIR + 'valid.json')
test_df = pd.read_json(OUTPUT_DIR + 'test.json')

In [4]:
text = train_df.text_no_tags.to_list()
hashtags = train_df.hashtags_text.to_list()

# Make vocab

In [5]:
test_words = ' '.join(sum([],test_df.text_no_tags.to_list() + test_df.hashtags_text.to_list())).split()

In [6]:
all_words = ' '.join(sum([],text+hashtags)).split() + test_words

In [7]:
word_counts = Counter(all_words)
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
vocab_to_int = {w: k for k, w in int_to_vocab.items()}
n_vocab = len(int_to_vocab)

print('Vocabulary size', n_vocab)

Vocabulary size 162193


In [8]:
def numericalize(sentence, vocab_to_int=vocab_to_int, seq_len=40):
    ints = [vocab_to_int[w] for w in sentence.split()]
    return np.concatenate([np.array(ints), np.zeros(tags_len-len(ints))])

In [9]:
sentence = 40
tags_len = 40

train_df['int_text'] = train_df.text_no_tags.apply(numericalize)
train_df['int_tags'] = train_df.hashtags_text.apply(numericalize)

In [10]:
test_df['int_text'] = test_df.text_no_tags.apply(numericalize)
test_df['int_tags'] = test_df.hashtags_text.apply(numericalize)

In [11]:
in_text = train_df.int_text.to_list()
out_text = train_df.int_tags.to_list()

In [12]:
in_text[0].shape

(40,)

In [13]:
def get_batches(in_text, out_text, batch_size, seq_size):
    for i in range(0, len(in_text), batch_size):
        yield in_text[i:i+batch_size], out_text[i:i+batch_size]

In [14]:
# Model

In [15]:
class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(lstm_size, n_vocab)
    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state
    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),
                torch.zeros(1, batch_size, self.lstm_size))

In [16]:
def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return criterion, optimizer

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

net = RNNModule(n_vocab, flags.seq_size,
                flags.embedding_size, flags.lstm_size)
net = net.to(device)

criterion, optimizer = get_loss_and_train_op(net, 0.01)

iteration = 0

In [18]:
def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    
    in_text = words.text_no_tags.apply(numericalize).to_list()
    
    hashtags = words.hashtags_text.values
    text = words.text_no_tags.values
    
#     print('-'*20)
#     print('gold')
#     print(text)
#     print(hashtags)
#     print('-'*20)
    
    net.eval()        

    state_h, state_c = net.zero_state(5)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    
    ix = torch.tensor(in_text).long().to(device)
    
    output, (state_h, state_c) = net(ix, (state_h, state_c))
#     print(output[0])
    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    
    
    for n, c in enumerate(choices[:len(text)]):
        try:
            print('-'*20)
            print('gold')
            print(text[n])
            print(hashtags[n])
            print('prediction:   ')
            print([int_to_vocab[w] for w in c])
            print('-'*20)
        except:
            pass

#     words.append(int_to_vocab[choice])
    
#     for _ in range(100):
#         ix = torch.tensor([[choice]]).to(device)
#         output, (state_h, state_c) = net(ix, (state_h, state_c))

#         _, top_ix = torch.topk(output[0], k=top_k)
#         choices = top_ix.tolist()
#         choice = np.random.choice(choices[0])
#         words.append(int_to_vocab[choice])

#     print(' '.join(words))

In [19]:
for e in range(50):
    batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
    state_h, state_c = net.zero_state(flags.batch_size)

    # Transfer data to GPU
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for x, y in batches:
        iteration += 1

        # Tell it we are in training mode
        net.train()

        # Reset all gradients
        optimizer.zero_grad()

        # Transfer data to GPU
        x = torch.tensor(x).long().to(device)
        y = torch.tensor(y).long().to(device)
#         print(x.shape)
        logits, (state_h, state_c) = net(x, (state_h, state_c))
        loss = criterion(logits.transpose(1, 2), y)

        state_h = state_h.detach()
        state_c = state_c.detach()

        loss_value = loss.item()

        # Perform back-propagation
#         loss.backward()

        # Update the network's parameters
        optimizer.step()
        loss.backward()

        _ = torch.nn.utils.clip_grad_norm_(
            net.parameters(), flags.gradients_norm)

        optimizer.step()
        if iteration % 100 == 0:
            print('Epoch: {}/{}'.format(e, 200),
                  'Iteration: {}'.format(iteration),
                  'Loss: {}'.format(loss_value))

        if iteration % 1000 == 0:
            predict(device, net, test_df.sample(n=5), n_vocab,
                    vocab_to_int, int_to_vocab, top_k=5)
            torch.save(net.state_dict(),
                       'model-{}.pth'.format(iteration))

Epoch: 0/200 Iteration: 100 Loss: 0.8700257539749146
Epoch: 0/200 Iteration: 200 Loss: 0.39841514825820923
Epoch: 0/200 Iteration: 300 Loss: 0.4980195462703705
Epoch: 0/200 Iteration: 400 Loss: 0.32982662320137024
Epoch: 0/200 Iteration: 500 Loss: 0.7507055401802063
Epoch: 0/200 Iteration: 600 Loss: 0.4355921745300293
Epoch: 0/200 Iteration: 700 Loss: 0.4667157530784607
Epoch: 0/200 Iteration: 800 Loss: 0.43873634934425354
Epoch: 0/200 Iteration: 900 Loss: 0.41016799211502075
Epoch: 0/200 Iteration: 1000 Loss: 0.3848070800304413
--------------------
gold
RT @PatriaPet barks to @DogTipper @KimPup @petxpert @rescuegal @DOGliving @bestbullysticks @SwtGeorgiaBrwn @Lovemy3goldens @Sylviasnote
#FF
prediction:   
['#outdoor', '#YouMightBeALiberal', '#uneedyourassbeat', '#producer', '#estespark']
--------------------
--------------------
gold
I cry at Formula 1 qualifying.
#iamsinglebecause
prediction:   
['#EasternStandard', 'the', '#TIGERWOODS', '#football', '#SOT']
--------------------
----

RuntimeError: Expected hidden[0] size (1, 1, 64), got (1, 16, 64)

In [None]:
torch.tensor([test_df.sample(n=5).hashtags_text.apply(numericalize).to_list()]).long().shape