In [None]:
import logging 
import random
import torch 
import torch.random

import pandas as pd 

# Configure the root logger so that DEBUG-level (and more severe) records are emitted.
logging.basicConfig(level=logging.DEBUG)

## Download and read the data

In [None]:
# Shell out to the download-data helper one directory up (relative path, so the
# working directory matters). Presumably it fetches the CSV read below — TODO confirm.
!../utils/download-data

In [None]:
# Load the raw dataset into a DataFrame and print its column / dtype / null summary.
text_emotion = pd.read_csv("../data/text_emotion.csv")
text_emotion.info()

In [None]:
# Preview the first rows of the loaded frame.
text_emotion.head()

## Split the dataset

In [None]:
# Hold out a random 20% of the rows as the test set; the fixed random_state
# makes the split repeatable.
text_emotion_test = text_emotion.sample(frac=0.2, axis=0, random_state=1979)
# Everything that was not sampled into the test set becomes the training set.
text_emotion_train = text_emotion.drop(text_emotion_test.index)

## Explore a few of the tweets

In [None]:
# Lift the column-width limit only inside this block so full tweets are shown.
with pd.option_context('display.max_colwidth', None):
    # random.randrange excludes its upper bound, so every draw is a valid
    # positional index. random.randint(0, n) includes n, which made .iloc
    # raise IndexError whenever n itself was drawn.
    random_tweets = [random.randrange(text_emotion.shape[0]) for _ in range(10)]
    # print() is required here: an expression nested inside a `with` block is
    # not auto-displayed as the cell's output.
    print(text_emotion.iloc[random_tweets]['content'])

## Let's extract the tokens from our dataset

In [None]:
def tokenize(text: str) -> list[str]:
    """Split the input text into character-level tokens.

    Every character of ``text`` becomes one token, kept in order of
    appearance. ``None`` is passed through unchanged rather than tokenized.
    """
    if text is None:
        return None
    return [character for character in text]

In [None]:
# Concatenate the training tweets into one string, separated by single spaces.
corpus = text_emotion_train['content'].str.cat(sep=' ')
# Character-level tokenization of the whole training corpus.
tokens = tokenize(corpus)
# Peek at the first ten tokens.
tokens[:10]

## Define our dictionary

In [None]:
# Sorting gives every distinct token a deterministic index for a given corpus.
unique_tokens = sorted(set(tokens))
alphabet_size = len(unique_tokens)
# Build both lookup directions directly from the sorted vocabulary. The earlier
# zip(*...) transposition produced the same dictionaries but raised IndexError
# when the vocabulary was empty.
idx_by_token = {token: idx for idx, token in enumerate(unique_tokens)}
token_by_idx = dict(enumerate(unique_tokens))

In [None]:
# Show the first 10 entries (in insertion order) of each lookup dictionary:
# index -> token and token -> index.
print(f'Token by idx: {dict(list(token_by_idx.items())[:10])}...')
print(f'Idx by token: {dict(list(idx_by_token.items())[:10])}...')

## Probability Matrix

In [None]:
# Square integer matrix with one row and one column per token in the
# dictionary, initialised to zero; the counting cell fills it in.
bigram_frequencies = torch.zeros(alphabet_size, alphabet_size, dtype=torch.int32)
bigram_frequencies

In [None]:
# Re-display the first ten tokens for reference.
tokens[:10]

In [None]:
# Every pair of adjacent tokens in the corpus, in order.
bigrams = list(zip(tokens, tokens[1:]))

# Count all bigrams in one vectorised pass and rebuild the matrix from scratch.
# Incrementing the existing matrix in place made this cell non-idempotent
# (running it twice doubled every count) and needed one tensor write per bigram.
token_ids = torch.tensor([idx_by_token[token] for token in tokens], dtype=torch.long)
# Encode each (first, second) pair as the row-major offset
# first * alphabet_size + second, so that bincount + reshape puts the count
# for the pair at [first, second].
pair_ids = token_ids[:-1] * alphabet_size + token_ids[1:]
bigram_frequencies = (
    torch.bincount(pair_ids, minlength=alphabet_size * alphabet_size)
    .reshape(alphabet_size, alphabet_size)
    .to(torch.int32)  # keep the dtype the matrix was created with
)
bigram_frequencies

In [None]:
from collections import Counter

# Counter tallies every distinct bigram in a single pass over the sequence.
frequencies_by_bigram = Counter(bigrams)
frequencies_by_bigram

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Quick look at the bigram count matrix rendered as an image.
plt.imshow(bigram_frequencies)

In [None]:
plt.figure(figsize=(16, 16))
plt.imshow(bigram_frequencies, cmap='Blues')
# Label every cell with the two characters of the bigram it represents:
# the row selects the first token, the column the second.
for row in range(alphabet_size):
    first_token = token_by_idx[row]
    for col in range(alphabet_size):
        label = first_token + token_by_idx[col]
        plt.text(col, row, label, ha='center', va='bottom', color='gray')
        # Optional: also print the count underneath the label.
        # plt.text(col, row, bigram_frequencies[row, col].item(), ha='center', va='top', color='gray')
plt.axis('off')

In [None]:
def encode(tokens: list[str], dtype: torch.dtype = torch.uint8) -> torch.Tensor:
    """Return the integer encoding of each token in the input sequence.

    Each token's index is simply its position in the corpus dictionary
    ``idx_by_token``.

    Args:
        tokens: Tokens to encode; each one must be a key of ``idx_by_token``.
        dtype: Integer dtype of the returned tensor. Defaults to
            ``torch.uint8`` (the previous hard-coded value), which can only
            hold indices 0-255. Pass a wider type such as ``torch.long`` when
            the dictionary has more than 256 entries.

    Returns:
        A 1-D tensor holding one index per input token, in input order.

    Raises:
        KeyError: If a token is missing from ``idx_by_token``.
    """
    encoded_tokens = [idx_by_token[token] for token in tokens]
    return torch.tensor(encoded_tokens, dtype=dtype)

In [None]:
def decode(tokens: torch.Tensor) -> list[str]:
    """Map encoded indices back to their tokens via ``token_by_idx``.

    Tensors with more than one dimension are flattened first, so the result
    is always a flat list of tokens in row-major order.
    """
    flat_indices = tokens.flatten() if tokens.dim() > 1 else tokens
    return [token_by_idx[index] for index in flat_indices.tolist()]

### An example of decoding an encoded token sequence using our dictionary

In [None]:
# Hand-written batch of two encoded sequences of six indices each. What it
# decodes to depends on the indices currently stored in token_by_idx.
enc = torch.tensor([[38, 67, 74, 74, 77, 1], [82, 70, 67, 80, 67, 14]], dtype=torch.uint8)
enc

In [None]:
# Decode the 2-D batch; decode() flattens it, so the result is one flat token list.
decode(enc)

In [None]:
# Join the decoded tokens back into a single string.
"".join(decode(enc))

## Create our sampler

In [None]:
def sample_training_batch(text: str, context_size: int = 32, batch_size: int = 8) -> torch.Tensor:
    """Return a random training batch from the specified text.

    ``batch_size`` start positions are drawn uniformly at random; the
    ``context_size`` characters that follow each start are tokenized and
    encoded into one row of the batch.

    Args:
        text: Corpus to sample from.
        context_size: Number of characters in each sampled window.
        batch_size: Number of windows (rows) in the batch.

    Returns:
        A ``torch.uint8`` tensor of shape ``(batch_size, context_size)``.

    Raises:
        ValueError: If ``text`` is too short to offer any start position.
    """
    text_size = len(text)
    logging.info(f'Text corpus size: {text_size}')
    max_sampling_index = text_size - context_size - 1
    logging.info(f'Upper index limit for sampling: {max_sampling_index}')
    if max_sampling_index <= 0:
        # torch.randint would otherwise fail with an opaque range error.
        raise ValueError(
            f'text of length {text_size} is too short to sample windows of '
            f'context_size={context_size}'
        )
    # torch.randint's upper bound is exclusive. Converting to plain ints keeps
    # the string slicing below free of tensor scalars.
    start_indices = torch.randint(max_sampling_index, (batch_size,), dtype=torch.int32).tolist()
    sample_batch = torch.zeros((batch_size, context_size), dtype=torch.uint8)
    for row, start in enumerate(start_indices):
        sample_batch[row] = encode(tokenize(text[start:start + context_size]))
    return sample_batch

In [None]:
# Draw one batch from the training corpus using the default context and batch sizes.
sample_training_batch(corpus)

## Create the embedding matrix

In [None]:
# `char_dict` is not defined anywhere in this notebook, so the original line
# raised NameError. The vocabulary size computed together with the token
# dictionaries is `alphabet_size`.
n_chars = alphabet_size
# Square matrix with one row per token, filled with uniform random values in [0, 1).
embed = torch.rand(n_chars, n_chars)
embed

In [None]:
# Batch hyperparameters. The values match the defaults of sample_training_batch
# (batch_size=8, context_size=32); note the different name, context_length.
batch_size = 8
context_length = 32

In [None]:
def get_random_training_batch():
    """Placeholder for a random training-batch sampler (not implemented).

    The function previously had no body at all, which is a syntax error and
    stopped the notebook from running top to bottom. Until it is written,
    ``sample_training_batch`` provides batch sampling.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        'get_random_training_batch is a stub; use sample_training_batch instead'
    )

In [None]:
import random

# Number of rows in the full (unsplit) text_emotion frame.
dataset_size = text_emotion.shape[0]

In [None]:
# NOTE(review): `short_jokes` is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. It looks like a leftover
# from a different dataset — confirm and either load it or remove the cell.
short_jokes['train'][0]

In [None]:
# Show the 'text' field of the training example at position 10.
# NOTE(review): depends on `short_jokes`, which the visible notebook never defines.
short_jokes['train'][10]['text']

In [None]:
# Distinct characters across the 'text' of all training examples (joined with
# spaces). The list comes from a set, so its order is not sorted or otherwise
# fixed by this code.
# NOTE(review): depends on `short_jokes`, which the visible notebook never defines.
dataset_unique_chars = list(set(' '.join([joke['text'] for joke in short_jokes['train']])))

In [None]:
# Map each character to its position in dataset_unique_chars.
ix_by_char = {char: ix for ix, char in enumerate(dataset_unique_chars)}

In [None]:
class Tokenizer:
    """Character-level tokenizer backed by the module-level ``ix_by_char`` map."""

    @staticmethod
    def tokenize(text: str) -> list[int]:
        """Return the ``ix_by_char`` index of every character in ``text``, in order.

        A character that is missing from ``ix_by_char`` raises ``KeyError``.
        """
        indices = []
        for char in text:
            indices.append(ix_by_char[char])
        return indices

## Creating The Training Data

In [None]:
def to_training_sequence(text: str) -> list[tuple[list[str], str]]:
    """Expand a text into next-character training examples.

    One example is produced per character of ``text``: its input is the list
    of all characters to the left of that character and its target is the
    character itself. A final example pairs the complete character list with
    the empty string as target.

    >>> to_training_sequence('cat')
    [([], 'c'),
     (['c'], 'a'),
     (['c', 'a'], 't'),
     (['c', 'a', 't'], '')]
    """
    logging.debug(f'Generating training examples from the input "{text}"')
    chars = list(text)
    # Appending "" supplies the end-of-text target for the last example.
    targets = chars + [""]
    train_examples = [(chars[:position], target) for position, target in enumerate(targets)]
    logging.debug(f'Training examples successfully generated: {train_examples}')
    return train_examples

In [None]:
# Demonstrate the helper on a short word.
to_training_sequence('coyote')