In [1]:
import pandas as pd
import torch
import time
import copy

Selecting device for calculations

In [2]:
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
dev

device(type='cuda')

Reading data

In [4]:
df = pd.read_csv('./data.csv')
df.head(15)

Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,0,10368,35,29,"Lisa Simpson: Maggie, look. What's that?",235000,True,9,5.0,Lisa Simpson,Simpson Home,"Maggie, look. What's that?",maggie look whats that,4.0
1,1,10369,35,30,Lisa Simpson: Lee-mur. Lee-mur.,237000,True,9,5.0,Lisa Simpson,Simpson Home,Lee-mur. Lee-mur.,lee-mur lee-mur,2.0
2,2,10370,35,31,Lisa Simpson: Zee-boo. Zee-boo.,239000,True,9,5.0,Lisa Simpson,Simpson Home,Zee-boo. Zee-boo.,zee-boo zee-boo,2.0
3,3,10372,35,33,Lisa Simpson: I'm trying to teach Maggie that ...,245000,True,9,5.0,Lisa Simpson,Simpson Home,I'm trying to teach Maggie that nature doesn't...,im trying to teach maggie that nature doesnt e...,24.0
4,4,10374,35,35,"Lisa Simpson: It's like an ox, only it has a h...",254000,True,9,5.0,Lisa Simpson,Simpson Home,"It's like an ox, only it has a hump and a dewl...",its like an ox only it has a hump and a dewlap...,18.0
5,5,10378,35,39,"Lisa Simpson: You know his blood type, how rom...",266000,True,9,5.0,Lisa Simpson,Simpson Home,"You know his blood type, how romantic.",you know his blood type how romantic,7.0
6,6,10380,35,41,"Lisa Simpson: Oh, yeah? What's my shoe size?",271000,True,9,5.0,Lisa Simpson,Simpson Home,"Oh, yeah? What's my shoe size?",oh yeah whats my shoe size,6.0
7,7,10386,35,47,Lisa Simpson: Ring.,276000,True,9,5.0,Lisa Simpson,Simpson Home,Ring.,ring,1.0
8,8,10448,35,109,"Lisa Simpson: Yes, Dad.",514000,True,9,5.0,Lisa Simpson,Simpson Home,"Yes, Dad.",yes dad,2.0
9,9,10528,35,189,"Lisa Simpson: Ooh, look, Maggie, what is that?...",861000,True,9,5.0,Lisa Simpson,Simpson Home,"Ooh, look, Maggie, what is that? Do-dec-ah-edr...",ooh look maggie what is that do-dec-ah-edron d...,8.0


In [5]:
phrases = df['normalized_text'].tolist()
phrases[:10]

['maggie look whats that',
 'lee-mur lee-mur',
 'zee-boo zee-boo',
 'im trying to teach maggie that nature doesnt end with the barnyard i want her to have all the advantages that i didnt have',
 'its like an ox only it has a hump and a dewlap hump and dew-lap hump and dew-lap',
 'you know his blood type how romantic',
 'oh yeah whats my shoe size',
 'ring',
 'yes dad',
 'ooh look maggie what is that do-dec-ah-edron dodecahedron']

Converting phrases to arrays of chars (we are going to use char as a token)

In [6]:
plain_text = [[c for c in ph] for ph in phrases if type(ph) is str]

In [7]:
plain_text[1]

['l', 'e', 'e', '-', 'm', 'u', 'r', ' ', 'l', 'e', 'e', '-', 'm', 'u', 'r']

Defining vocabluary and token to index, index to token converts

In [8]:
CHARS = list('abcdefghijklmnopqrstuvwxyz!?.-_ ')
INDEX_TO_CHAR = ['none'] + [c for c in CHARS]
CHAR_TO_INDEX = { c: i for i, c in enumerate(INDEX_TO_CHAR) }

In [9]:
len(INDEX_TO_CHAR)

33

In [10]:
INDEX_TO_CHAR

['none',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '!',
 '?',
 '.',
 '-',
 '_',
 ' ']

In [11]:
CHAR_TO_INDEX

{'none': 0,
 'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '!': 27,
 '?': 28,
 '.': 29,
 '-': 30,
 '_': 31,
 ' ': 32}

Defining Caesar encoding algorithm and using it for plain text encoding

In [12]:
def caesar_encode(text, k = 2):
    encoded_text = copy.deepcopy(text)

    for i, sentence in enumerate(text):
        for j, c in enumerate(sentence):
            encoded_text[i][j] = INDEX_TO_CHAR[(CHAR_TO_INDEX.get(c, CHAR_TO_INDEX['none']) + k) % len(INDEX_TO_CHAR)]

    return encoded_text

In [13]:
encoded_text = caesar_encode(plain_text)

In [14]:
plain_text[0]

['m',
 'a',
 'g',
 'g',
 'i',
 'e',
 ' ',
 'l',
 'o',
 'o',
 'k',
 ' ',
 'w',
 'h',
 'a',
 't',
 's',
 ' ',
 't',
 'h',
 'a',
 't']

In [15]:
encoded_text[0]

['o',
 'c',
 'i',
 'i',
 'k',
 'g',
 'a',
 'n',
 'q',
 'q',
 'm',
 'a',
 'y',
 'j',
 'c',
 'v',
 'u',
 'a',
 'v',
 'j',
 'c',
 'v']

Vectorizing input (encoded text) and output (plain text) texts

In [16]:
def vectorize_text(text):
    MAX_LEN = 50
    v = torch.zeros((len(text), MAX_LEN), dtype=int).to(dev)

    for i in range(len(text)):
        for j, c in enumerate(text[i]):
            if j >= MAX_LEN:
                break
            v[i, j] = CHAR_TO_INDEX.get(c, CHAR_TO_INDEX['none'])

    return v

In [17]:
X = vectorize_text(encoded_text)
Y = vectorize_text(plain_text)

In [18]:
X[0:2]

tensor([[15,  3,  9,  9, 11,  7,  1, 14, 17, 17, 13,  1, 25, 10,  3, 22, 21,  1,
         22, 10,  3, 22,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [14,  7,  7, 32, 15, 23, 20,  1, 14,  7,  7, 32, 15, 23, 20,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
       device='cuda:0')

In [19]:
Y[0:2]

tensor([[13,  1,  7,  7,  9,  5, 32, 12, 15, 15, 11, 32, 23,  8,  1, 20, 19, 32,
         20,  8,  1, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [12,  5,  5, 30, 13, 21, 18, 32, 12,  5,  5, 30, 13, 21, 18,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
       device='cuda:0')

Defining decoding neural network

In [20]:
class CaesarDecodingNetwork(torch.nn.Module):
    def __init__(self, vocabluary_size, embedding_size, rnn_size) -> None:
        super(CaesarDecodingNetwork, self).__init__()

        self.embedding = torch.nn.Embedding(vocabluary_size, embedding_size).to(dev)

        self.rnn = torch.nn.RNN(embedding_size, rnn_size, batch_first=True).to(dev)

        self.fcnn = torch.nn.Sequential(
            torch.nn.Tanh(),
            torch.nn.Linear(rnn_size, vocabluary_size),
            torch.nn.ReLU()
        ).to(dev)

    def forward(self, sentences):
        embeddings = self.embedding(sentences)
        o, _ = self.rnn(embeddings)
        return self.fcnn(o)

Creating and training model object

In [21]:
model = CaesarDecodingNetwork(len(INDEX_TO_CHAR), 40, 50)

In [22]:
criterion = torch.nn.CrossEntropyLoss().to(dev)
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

In [23]:
def train_caesar_decoder(model, X, Y, n_epochs, n_batch_size, n_epochs_to_print):
    for epoch in range(n_epochs):
        start = time.time()
        train_loss = 0.
        train_passed = 0

        for i in range(int(len(X) / n_batch_size)):
            X_batch = X[i*n_batch_size: (i+1) * n_batch_size]
            Y_batch = Y[i*n_batch_size: (i+1) * n_batch_size].flatten()

            optimizer.zero_grad()

            answers = model.forward(X_batch)
            answers = answers.view(-1, len(INDEX_TO_CHAR))
            loss = criterion(answers, Y_batch)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_passed += 1
        
        if epoch % n_epochs_to_print == 0:
            print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(epoch, time.time() - start, train_loss / train_passed))

In [24]:
train_caesar_decoder(model, X, Y, 100, 100, 10)

Epoch 0. Time: 0.377, Train loss: 0.117
Epoch 10. Time: 0.173, Train loss: 0.000
Epoch 20. Time: 0.178, Train loss: 0.000
Epoch 30. Time: 0.181, Train loss: 0.000
Epoch 40. Time: 0.181, Train loss: 0.000
Epoch 50. Time: 0.167, Train loss: 0.000
Epoch 60. Time: 0.177, Train loss: 0.000
Epoch 70. Time: 0.191, Train loss: 0.000
Epoch 80. Time: 0.179, Train loss: 0.000
Epoch 90. Time: 0.180, Train loss: 0.000


Defining util to decode text using provided decoder model

In [25]:
def devectorize_text(text):
    devectorized_text = []
    for phrase in text:
        devectorized_text.append([INDEX_TO_CHAR[v.argmax()] for v in phrase])

    return devectorized_text

In [33]:
def decode_text(decoder_model, encoded_text):
    vectorized_encoded_text = vectorize_text(encoded_text)
    vectorized_decoded_text = decoder_model(vectorized_encoded_text)
    devectorized_decoded_text = devectorize_text(vectorized_decoded_text)
    return [phrase[:phrase.index('none')] if 'none' in phrase else phrase for phrase in devectorized_decoded_text]

In [34]:
decode_text(model, encoded_text[0:2])[1]

['l', 'e', 'e', '-', 'm', 'u', 'r', ' ', 'l', 'e', 'e', '-', 'm', 'u', 'r']

Estimating quality of our model calculating average cosin similarity between decoded and plain texts

In [36]:
decoded_text = decode_text(model, encoded_text)

In [37]:
decoded_text

[['m',
  'a',
  'g',
  'g',
  'i',
  'e',
  ' ',
  'l',
  'o',
  'o',
  'k',
  ' ',
  'w',
  'h',
  'a',
  't',
  's',
  ' ',
  't',
  'h',
  'a',
  't'],
 ['l', 'e', 'e', '-', 'm', 'u', 'r', ' ', 'l', 'e', 'e', '-', 'm', 'u', 'r'],
 ['z', 'e', 'e', '-', 'b', 'o', 'o', ' ', 'z', 'e', 'e', '-', 'b', 'o', 'o'],
 ['i',
  'm',
  ' ',
  't',
  'r',
  'y',
  'i',
  'n',
  'g',
  ' ',
  't',
  'o',
  ' ',
  't',
  'e',
  'a',
  'c',
  'h',
  ' ',
  'm',
  'a',
  'g',
  'g',
  'i',
  'e',
  ' ',
  't',
  'h',
  'a',
  't',
  ' ',
  'n',
  'a',
  't',
  'u',
  'r',
  'e',
  ' ',
  'd',
  'o',
  'e',
  's',
  'n',
  't',
  ' ',
  'e',
  'n',
  'd',
  ' ',
  'w'],
 ['i',
  't',
  's',
  ' ',
  'l',
  'i',
  'k',
  'e',
  ' ',
  'a',
  'n',
  ' ',
  'o',
  'x',
  ' ',
  'o',
  'n',
  'l',
  'y',
  ' ',
  'i',
  't',
  ' ',
  'h',
  'a',
  's',
  ' ',
  'a',
  ' ',
  'h',
  'u',
  'm',
  'p',
  ' ',
  'a',
  'n',
  'd',
  ' ',
  'a',
  ' ',
  'd',
  'e',
  'w',
  'l',
  'a',
  'p',
  ' ',
  'h',
  

In [38]:
vectorized_plain_text = vectorize_text(plain_text)
vectorized_decoded_text = vectorize_text(decoded_text)

In [39]:
similarity_scores = torch.nn.functional.cosine_similarity(
        torch.tensor(vectorized_plain_text, dtype=torch.float32), torch.tensor(vectorized_decoded_text, dtype=torch.float32)
    )

  torch.tensor(vectorized_plain_text, dtype=torch.float32), torch.tensor(vectorized_decoded_text, dtype=torch.float32)


In [40]:
similarity_scores.mean().item()

0.9978144764900208