In [21]:
import logging 
import torch 
import torch.random

import numpy as np
import pandas as pd 

from pathlib import Path 

logging.basicConfig(level=logging.DEBUG)

DATASET_PATH = "../data/text_emotion.csv"
RANDOM_STATE = 42

TEST_SIZE = 0.2
SENTENCE_SEPARATOR = ". "

## Load the dataset 

In [2]:
# Show the head of the dataset or throw an error if the dataset file is not found
dataset_path = Path(DATASET_PATH).resolve()
if not dataset_path.exists() or not dataset_path.is_file():
    logging.error(f'No dataset file could be found at the specified path: \'{dataset_path}\'')
    raise FileNotFoundError
dataset = pd.read_csv(dataset_path)
dataset.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


In [3]:
test_dataset = dataset.sample(frac=TEST_SIZE, axis=0, random_state=RANDOM_STATE)
train_dataset = dataset.drop(test_dataset.index)
logging.info("Successfully split train and test datasets")
logging.info(f"# train instances: {len(train_dataset)}")
logging.info(f"# test instances: {len(test_dataset)}")

INFO:root:Successfully split train and test datasets
INFO:root:# train instances: 32000
INFO:root:# test instances: 8000


## Convert the dataset to a text corpus 

In [22]:
def to_corpus(dataset: pd.DataFrame) -> str:
    """Converts the training dataset to a corpus of text"""
    logging.debug(f"Converting dataset of shape {dataset['content'].shape} to corpus")
    return dataset['content'].str.cat(sep=SENTENCE_SEPARATOR)

In [25]:
train_corpus = to_corpus(train_dataset)
print(f"Sample of output corpus: '{train_corpus[:100]}'")

DEBUG:root:Converting dataset of shape (32000,) to corpus


Sample of output corpus: '@tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =[. Funera'


## Tokenize the corpus

In [7]:
def tokenize(text: str) -> list[str]:
    """Returns the input text as a sequence of tokens
    
    The input is tokenized at character level and returns each character
    in the order they appear in the input
    """
    return list(text) if text is not None else None

In [8]:
corpus = text_emotion_train['content'].str.cat(sep=' ')
tokens = tokenize(corpus)
tokens[:10]

['@', 't', 'i', 'f', 'f', 'a', 'n', 'y', 'l', 'u']

## Define our dictionary

In [9]:
unique_tokens = sorted(list(set(tokens)))
alphabet_size = len(unique_tokens)
token_mappings = list(zip(*[((token, idx), (idx, token)) for idx, token in enumerate(unique_tokens)]))
idx_by_token = dict(token_mappings[0])
token_by_idx = dict(token_mappings[1])

In [10]:
# print the 'first' 10 entries in each token - id dictionary
print(f'Token by idx: {dict(list(token_by_idx.items())[:10])}...')
print(f'Idx by token: {dict(list(idx_by_token.items())[:10])}...')

Token by idx: {0: '\t', 1: ' ', 2: '!', 3: '#', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')'}...
Idx by token: {'\t': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9}...


## Frequency Matrix

In [11]:
bigram_frequencies = torch.zeros(alphabet_size, alphabet_size, dtype=torch.int32)
bigram_frequencies

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)

In [12]:
bigrams = list(zip(tokens, tokens[1:]))

In [13]:
def count_bigram_frequency(bigrams: list[tuple[str, str]]) -> torch.Tensor:
    """Returns a tensor containing the total occurrences of each bigram
    
    The dimensions of the returned tensor are equal in size to the alphabet
    size. The cell at (i,j) corresponds the number of occurrences of the 
    bigrams consisting of the token at the ith position in the alphabet
    followed by the token at the jth position"""
    bigram_frequencies = torch.zeros((alphabet_size, alphabet_size), dtype=torch.int32)
    for bigram in bigrams:
        bigram_frequencies[idx_by_token[bigram[0]], idx_by_token[bigram[1]]] += 1
    return bigram_frequencies

In [14]:
bigram_frequencies = count_bigram_frequency(bigrams)
bigram_frequencies

tensor([[    0,     0,     0,  ...,     0,     0,     0],
        [    0, 13755,   456,  ...,     0,     0,    63],
        [    0, 12050,  5195,  ...,     0,     0,     0],
        ...,
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,   362,     0,     0]], dtype=torch.int32)

## Generate a short sequence

### Convert the frequencies to probabilities

In [15]:
bigram_probabilities = bigram_frequencies / bigram_frequencies.sum(1, keepdims=True)
bigram_probabilities

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 3.1508e-02, 1.0445e-03,  ..., 0.0000e+00, 0.0000e+00,
         1.4431e-04],
        [0.0000e+00, 6.8210e-01, 2.9407e-01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00, 0.0000e+00,
         0.0000e+00]])

In [16]:
max_sequence_length = 120 

In [17]:
def generate_text(max_length: int = 120) -> str:
    """Generates text until a full stop is encountered or until the max length is reached"""
    current_sequence_length = 0
    current_token = 10 # full stop
    current_sequence = ''
    while current_token != 14 and current_sequence_length <= max_sequence_length:
        current_token = torch.multinomial(bigram_probabilities[current_token], num_samples=1).item()
        current_sequence += token_by_idx[current_token]
        current_sequence_length += 1
    return current_sequence

In [18]:
for i in range(20):
    print(f'{i}: {generate_text()}')

0: isthag d lyo (w  gr2d t te ay ooot  ~! a w I dainnnn.
1: ng  omas gatit my Gut bap; sazicheelin d inclie lo thivink Pl toulledeth.
2:  hewad.
3: los.
4: .
5: rit lrbo D] #by.
6: peatPeJutha n'soextrso ck meer HAfe @Copiaveye Beeaser'tit yssouthe h * ha d s havis k.
7: biike, t, ies AheWowelichidaci we bucoollakathitoftee I os  i't opt**PMait  tht s ncllk ulleilyo @Zousakee we a bt neth a
8:  mowaghe helugut t me ha 2 the  tos itw it bh isichthepe8t;.
9:  Pisttazind ty an RUn ahmeplofu bslougethelt t, i he hine ankerastin ine Whes Yesslyo Noor n't USTo s ADa indar  lop:/ppi
10:  wexit d averark se ngitaty ARS ht me llon ugl I lahormare  he ithe woon lineed Is sove bed mee ouppiet s fot BLOWhxtoma 
11:  m! Wialde o DI @Bur the  2 h.
12:  fomoure, crl yedFrky blo incea s!! Fut hamies I  alld tuporenelin e tikemmo on fomast m thun tod  fat 2 ty ve CANonrip s
13:  ffupll youp t tussedin welloveratthowheecamye aicke ~ beenou Gutofufitoshoowen.
14: mo Wik thy-inonderebel ay ther ut.
15: ho