# Character-based language model with AllenNLP

In [None]:
# Print the version of the installed AllenNLP package.
import allennlp
print(allennlp.__version__)

In [None]:
# numeric and data-frame libraries
import numpy as np
import pandas as pd

# torch libraries
import torch
import torch.optim as optim

# plotting and standard-library helpers
from matplotlib import pyplot as plt
from collections import Counter
from typing import Dict, List, Tuple, Set, Iterable

# AllenNLP libraries
from allennlp.common.file_utils import cached_path
from allennlp.common.util import START_SYMBOL, END_SYMBOL
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.dataloader import DataLoader
from allennlp.data.dataset_readers.dataset_reader import AllennlpDataset
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token, CharacterTokenizer
from allennlp.data.vocabulary import Vocabulary, DEFAULT_PADDING_TOKEN
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.trainer import Trainer

In [None]:
# Load the gzip-compressed CSV of posts, then shuffle all rows with a fixed
# seed and renumber the index.
full_data = pd.read_csv(
    '/home/adelard/ml/manning/dataset/stackexchange_full_data_tokenized.csv.gz',
    compression='gzip',
)
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# concatenate the text of every post into one lower-cased string
text = "".join(full_data["text"].tolist()).lower()

In [None]:
# one list entry per character occurrence in the corpus
characters = list(text)
# distinct characters (np.unique returns them sorted)
unique_characters = np.unique(characters)
# number of distinct characters, a blank line, then the first 100 of them
print(len(unique_characters), end="\n\n")
print(unique_characters[:100])

In [None]:
# count the occurrences of each character; the most frequent ones are used
# below to define the vocabulary
char_count = Counter(characters)
print(len(char_count))
print(char_count.most_common())

In [None]:
# number of distinct characters kept in the vocabulary
MAX_VOCAB_SIZE = 50
# the MAX_VOCAB_SIZE most frequent characters, in sorted order
chars_most_common = sorted(
    char for char, _ in char_count.most_common(MAX_VOCAB_SIZE)
)

### Training data

In [None]:
# settings used to reduce the dataset before training
POSTS_TYPE = 'title'  # value of the `category` column to keep
SAMPLE_COUNT = 10000  # number of posts drawn from the filtered data

In [None]:
# Subsample the full dataset: keep only the posts of the requested category,
# then draw at most SAMPLE_COUNT of them. The draw is seeded so the training
# set is reproducible (like the shuffle of the full data), and the sample size
# is capped so a small category does not raise a ValueError.
small_data = full_data[full_data.category == POSTS_TYPE]
small_data = (small_data
              .sample(n=min(SAMPLE_COUNT, len(small_data)), random_state=42)
              .reset_index(drop=True))
print("data shape: ", small_data.shape)
# preview a few random posts (fewer when the subset itself is tiny)
print(small_data.text.sample(min(5, len(small_data))).values)

### AllenNLP structure
#### Instances
An instance is composed of the input and output objects.
#### Model
Model uses a forward() method that takes tensor inputs and produces a dict of tensor outputs that includes the loss used to train the model.
#### Tokenizer and instances
The AllenNLP CharacterTokenizer is used to split the sentences into lists of characters.

In [None]:
# tokenizer used below to turn each post into a sequence of character tokens
tokenizer = CharacterTokenizer()

In [None]:
# the training set: each sampled post, lower-cased and tokenized into characters
train_data = small_data['text'].apply(
    lambda post: tokenizer.tokenize(post.lower())
).values

In [None]:
# function to take a list of tokens and an indexer as input and returns 
# an instance composed of the input and output tokens.

def tokens_to_instance(toks: List[Token], token_indexers: Dict[str, TokenIndexer]):
    """Build one language-model instance from a token sequence.

    The sequence is wrapped between START_SYMBOL and END_SYMBOL. The input
    field holds every token except the last one and the output field every
    token except the first one, so the output is the input shifted by one
    position. The list passed in is not modified.
    """
    wrapped = [Token(START_SYMBOL), *toks, Token(END_SYMBOL)]
    fields = {
        'input_tokens': TextField(wrapped[:-1], token_indexers),
        'output_tokens': TextField(wrapped[1:], token_indexers),
    }
    return Instance(fields)

In [None]:
# A SingleIdTokenIndexer maps every token to its unique index in the
# vocabulary; it is registered under the 'tokens' key.
token_indexers = dict(tokens=SingleIdTokenIndexer())
# one instance (input / shifted output pair) per tokenized post
instances = [
    tokens_to_instance(char_tokens, token_indexers)
    for char_tokens in train_data
]

### Vocabulary

In [None]:
# Give every retained character a count of 1 so that it gets a vocabulary entry.
token_counts = {char: 1 for char in chars_most_common}
# tokens_to_instance wraps every sequence in START_SYMBOL / END_SYMBOL, so the
# two markers need entries of their own. Without them both are indexed as the
# out-of-vocabulary token and text generation can never sample an end marker.
token_counts[START_SYMBOL] = 1
token_counts[END_SYMBOL] = 1
vocab = Vocabulary({'tokens': token_counts})

### Model

In [None]:
EMBEDDING_SIZE = 32  # dimension of the character embeddings
HIDDEN_SIZE = 256  # size of the LSTM hidden state
BATCH_SIZE = 128  # number of instances per batch in the data loader

In [None]:
class RNNModel(Model):
    """Character-level LSTM language model.

    Each input character is embedded, the sequence is run through an LSTM and
    every hidden state is projected onto the 'tokens' vocabulary to score the
    next character.
    """

    def __init__(self, embedder: TextFieldEmbedder,
                 hidden_size: int,
                 max_len: int,
                 vocab: Vocabulary) -> None:
        """Create the model.

        Args:
            embedder: text-field embedder applied to the input characters.
            hidden_size: size of the LSTM hidden state.
            max_len: maximum number of characters produced by generate_text.
            vocab: vocabulary whose 'tokens' namespace defines the output size.
        """
        super().__init__(vocab)

        self.embedder = embedder

        # Initialize a seq2seq encoder (LSTM). Its sizes come from the embedder
        # and the constructor argument rather than from module-level constants,
        # so the LSTM always agrees with the zero state built in generate_text.
        self.rnn = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(embedder.get_output_dim(), hidden_size, batch_first=True))

        self.hidden2out = torch.nn.Linear(in_features=self.rnn.get_output_dim(),
                                          out_features=vocab.get_vocab_size('tokens'))
        self.hidden_size = hidden_size
        self.max_len = max_len

    def forward(self, input_tokens, output_tokens):
        """Compute the next-character loss for a batch.

        Args:
            input_tokens: text-field tensors of the input sequences.
            output_tokens: text-field tensors of the target sequences (the
                inputs shifted by one position).

        Returns:
            A dict with the masked sequence cross-entropy under 'loss'.
        """
        mask = get_text_field_mask(input_tokens)
        embeddings = self.embedder(input_tokens)
        rnn_hidden = self.rnn(embeddings, mask)
        out_logits = self.hidden2out(rnn_hidden)
        # AllenNLP 1.x text-field tensors are nested as
        # {indexer name: {tensor name: tensor}}; the loss needs the id tensor
        # itself, not the inner dict.
        targets = output_tokens['tokens']['tokens']
        loss = sequence_cross_entropy_with_logits(out_logits, targets, mask)

        return {'loss': loss}

    def generate_text(self) -> Tuple[List[Token], torch.Tensor]:
        """Sample a character sequence from the model.

        Characters are drawn one at a time from the predicted distribution,
        starting from START_SYMBOL, until END_SYMBOL is drawn or max_len
        characters have been produced. The start and padding symbols are
        rejected and re-drawn.

        Returns:
            The sampled tokens (without start / end markers) and the sum of
            the log-probabilities of the sampled characters, end marker
            included when it was drawn.
        """
        start_symbol_index = self.vocab.get_token_index(START_SYMBOL, 'tokens')
        end_symbol_index = self.vocab.get_token_index(END_SYMBOL, 'tokens')
        padding_symbol_index = self.vocab.get_token_index(DEFAULT_PADDING_TOKEN, 'tokens')
        rejected = {start_symbol_index, padding_symbol_index}

        # create the working tensors on the same device as the model weights
        device = self.hidden2out.weight.device

        log_likelihood = 0.
        result_words = []
        state = (torch.zeros(1, 1, self.hidden_size, device=device),
                 torch.zeros(1, 1, self.hidden_size, device=device))

        word_index = start_symbol_index

        for _ in range(self.max_len):
            tokens = torch.tensor([[word_index]], device=device)

            # same nested layout as the tensors produced by a TextField
            embeddings = self.embedder({'tokens': {'tokens': tokens}})
            # call the wrapped LSTM directly to carry the state between steps
            output, state = self.rnn._module(embeddings, state)
            output = self.hidden2out(output)

            log_prob = torch.log_softmax(output[0, 0], dim=0)

            dist = torch.exp(log_prob)
            word_index = start_symbol_index

            # rejection sampling: re-draw while a start / padding symbol comes out
            while word_index in rejected:
                word_index = torch.multinomial(dist, num_samples=1, replacement=False).item()

            log_likelihood += log_prob[word_index]

            if word_index == end_symbol_index:
                break

            token = Token(text=self.vocab.get_token_from_index(word_index, 'tokens'))
            result_words.append(token)

        return result_words, log_likelihood

In [None]:
# Build the character embedding layer, register it under the 'tokens' key of
# a text-field embedder, then create the RNN language model on top of it.
token_embedding = Embedding(embedding_dim=EMBEDDING_SIZE,
                            num_embeddings=vocab.get_vocab_size('tokens'))
embedder = BasicTextFieldEmbedder({'tokens': token_embedding})

# generated sequences are limited to 80 characters
model = RNNModel(embedder=embedder, vocab=vocab, hidden_size=HIDDEN_SIZE, max_len=80)

In [None]:
# In AllenNLP 1.x `Trainer` is only the abstract base class; the concrete
# trainer is GradientDescentTrainer, which takes a `data_loader` instead of
# the older `iterator` / `train_dataset` pair.
from allennlp.training.trainer import GradientDescentTrainer

# index the instances with the vocabulary and batch them
dataset = AllennlpDataset(instances, vocab)
# keep the DataLoader itself (not a single batch pulled out of it): the
# trainer iterates over it once per epoch
loader = DataLoader(dataset, batch_size=BATCH_SIZE)
optimizer = optim.Adam(model.parameters(), lr=5.e-3)

trainer = GradientDescentTrainer(model=model,
                                 optimizer=optimizer,
                                 data_loader=loader,
                                 num_epochs=20)
# run the training loop; without it the text generation below would sample
# from randomly initialized weights
trainer.train()

In [None]:
def predict(txt: str, model: Model) -> float:
    """Score a string with the language model.

    The text is lower-cased (the vocabulary and the training data are built
    from lower-cased text), split into characters, turned into an instance
    and passed through the model.

    Args:
        txt: the text to score.
        model: the trained language model.

    Returns:
        The loss the model assigns to the text. The full model output is
        also printed.
    """
    tokenizer = CharacterTokenizer()
    tokens = tokenizer.tokenize(txt.lower())

    token_indexers = {'tokens': SingleIdTokenIndexer()}
    instance = tokens_to_instance(tokens, token_indexers)
    output = model.forward_on_instance(instance)
    print(output)
    return float(output['loss'])

### Text generation

In [None]:
# sample and print 25 sequences from the model; the log-likelihood returned
# alongside each sequence is not used here
for _ in range(25):
    tokens, _log_likelihood = model.generate_text()
    print(''.join(token.text for token in tokens))