In [2]:
import logging 
import torch 
import torch.random

import numpy as np
import pandas as pd 

from pathlib import Path 

logging.basicConfig(level=logging.DEBUG)

## Download and read the data

In [13]:
text_emotion_path = Path('../data/text_emotion.csv').resolve()
if not text_emotion_path.exists() or not text_emotion_path.is_file():
    logging.error(f'No dataset files could be found at the specified path: \'{text_emotion_path}\'')
    raise FileNotFoundError

INFO:root:No dataset files could be found at the specified path: '/Users/kadeem/Spaces/Projects/ANa/ana-core/data/textz_emotion.csv'


FileNotFoundError: 

In [3]:
logging.info(f'Loading the dataset from \'{text_emotion_path.resolve()}\'')
text_emotion = pd.read_csv("../data/text_emotion.csv")
text_emotion.info()

INFO:root:Loading the dataset from '/Users/kadeem/Spaces/Projects/ANa/ana-core/data/text_emotion.csv'


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   author     40000 non-null  object
 3   content    40000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.2+ MB


In [4]:
text_emotion.head()

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...


## Split the dataset

In [5]:
text_emotion_test = text_emotion.sample(frac=0.2, axis=0, random_state=1979)
text_emotion_train = text_emotion.drop(text_emotion_test.index)

## Explore a few of the tweets

In [6]:
with pd.option_context('display.max_colwidth', None):
    random_tweets = [random.randint(0, text_emotion.shape[0]) for i in range(0, 10)]
    print(text_emotion.iloc[random_tweets]['content'])

10339    @Bizfizz Damn damn and blast!  I'm at LMHR tomorrow and Sam out   Who is running it?  I could see if she could join next week?
29951                                                                         A shower feels so refreshing after a long day at the fair
2019                                                                                                    @mosapp just say no reeesee cup
9026                             So much for buying that awesome new phone from sony ericsson  - I am now Berry'd like everyone else...
34798                                                                                                          @esuh so what if i cried
20490                                                                                    @shadiya I hope you were feeling better today!
4264                                                                                                              job hunting. yay.....
38196                                           

## Let's extract the tokens from our dataset

In [7]:
def tokenize(text: str) -> list[str]:
    """Returns the input text as a sequence of tokens
    
    The input is tokenized at character level and returns each character
    in the order they appear in the input
    """
    return list(text) if text is not None else None

In [8]:
corpus = text_emotion_train['content'].str.cat(sep=' ')
tokens = tokenize(corpus)
tokens[:10]

['@', 't', 'i', 'f', 'f', 'a', 'n', 'y', 'l', 'u']

## Define our dictionary

In [9]:
unique_tokens = sorted(list(set(tokens)))
alphabet_size = len(unique_tokens)
token_mappings = list(zip(*[((token, idx), (idx, token)) for idx, token in enumerate(unique_tokens)]))
idx_by_token = dict(token_mappings[0])
token_by_idx = dict(token_mappings[1])

In [10]:
# print the 'first' 10 entries in each token - id dictionary
print(f'Token by idx: {dict(list(token_by_idx.items())[:10])}...')
print(f'Idx by token: {dict(list(idx_by_token.items())[:10])}...')

Token by idx: {0: '\t', 1: ' ', 2: '!', 3: '#', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')'}...
Idx by token: {'\t': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9}...


## Frequency Matrix

In [11]:
bigram_frequencies = torch.zeros(alphabet_size, alphabet_size, dtype=torch.int32)
bigram_frequencies

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.int32)

In [12]:
bigrams = list(zip(tokens, tokens[1:]))

In [13]:
def count_bigram_frequency(bigrams: list[tuple[str, str]]) -> torch.Tensor:
    """Returns a tensor containing the total occurrences of each bigram
    
    The dimensions of the returned tensor are equal in size to the alphabet
    size. The cell at (i,j) corresponds the number of occurrences of the 
    bigrams consisting of the token at the ith position in the alphabet
    followed by the token at the jth position"""
    bigram_frequencies = torch.zeros((alphabet_size, alphabet_size), dtype=torch.int32)
    for bigram in bigrams:
        bigram_frequencies[idx_by_token[bigram[0]], idx_by_token[bigram[1]]] += 1
    return bigram_frequencies

In [14]:
bigram_frequencies = count_bigram_frequency(bigrams)
bigram_frequencies

tensor([[    0,     0,     0,  ...,     0,     0,     0],
        [    0, 13755,   456,  ...,     0,     0,    63],
        [    0, 12050,  5195,  ...,     0,     0,     0],
        ...,
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,     0,     0,     0],
        [    0,     0,     0,  ...,   362,     0,     0]], dtype=torch.int32)

## Generate a short sequence

### Convert the frequencies to probabilities

In [15]:
bigram_probabilities = bigram_frequencies / bigram_frequencies.sum(1, keepdims=True)
bigram_probabilities

tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 3.1508e-02, 1.0445e-03,  ..., 0.0000e+00, 0.0000e+00,
         1.4431e-04],
        [0.0000e+00, 6.8210e-01, 2.9407e-01,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.0000e+00, 0.0000e+00,
         0.0000e+00]])

In [16]:
max_sequence_length = 120 

In [17]:
def generate_text(max_length: int = 120) -> str:
    """Generates text until a full stop is encountered or until the max length is reached"""
    current_sequence_length = 0
    current_token = 10 # full stop
    current_sequence = ''
    while current_token != 14 and current_sequence_length <= max_sequence_length:
        current_token = torch.multinomial(bigram_probabilities[current_token], num_samples=1).item()
        current_sequence += token_by_idx[current_token]
        current_sequence_length += 1
    return current_sequence

In [18]:
for i in range(20):
    print(f'{i}: {generate_text()}')

0: isthag d lyo (w  gr2d t te ay ooot  ~! a w I dainnnn.
1: ng  omas gatit my Gut bap; sazicheelin d inclie lo thivink Pl toulledeth.
2:  hewad.
3: los.
4: .
5: rit lrbo D] #by.
6: peatPeJutha n'soextrso ck meer HAfe @Copiaveye Beeaser'tit yssouthe h * ha d s havis k.
7: biike, t, ies AheWowelichidaci we bucoollakathitoftee I os  i't opt**PMait  tht s ncllk ulleilyo @Zousakee we a bt neth a
8:  mowaghe helugut t me ha 2 the  tos itw it bh isichthepe8t;.
9:  Pisttazind ty an RUn ahmeplofu bslougethelt t, i he hine ankerastin ine Whes Yesslyo Noor n't USTo s ADa indar  lop:/ppi
10:  wexit d averark se ngitaty ARS ht me llon ugl I lahormare  he ithe woon lineed Is sove bed mee ouppiet s fot BLOWhxtoma 
11:  m! Wialde o DI @Bur the  2 h.
12:  fomoure, crl yedFrky blo incea s!! Fut hamies I  alld tuporenelin e tikemmo on fomast m thun tod  fat 2 ty ve CANonrip s
13:  ffupll youp t tussedin welloveratthowheecamye aicke ~ beenou Gutofufitoshoowen.
14: mo Wik thy-inonderebel ay ther ut.
15: ho