In [1]:
import os
import pandas as pd
import torch
from torch import nn 
import time
import numpy as np 

In [2]:
def get_data_path(file_name):
    """Build the path to ``file_name`` inside the ``Data`` folder of the working directory.

    Parameters
    ----------
    file_name : str
        Name of a file expected under ``<cwd>/Data``.

    Returns
    -------
    str
        The joined path ``<cwd>/Data/<file_name>``; existence is not checked.
    """
    # Resolved against the current working directory at call time.
    return os.path.join(os.getcwd(), 'Data', file_name)

In [3]:
# Read the script lines and drop the first column of the CSV.
# NOTE(review): presumably that column is a saved row index — confirm against data.csv.
raw_csv_path = get_data_path('data.csv')
simpson_data = pd.read_csv(raw_csv_path).iloc[:, 1:]

In [4]:
simpson_data.head()

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,10368,35,29,"Lisa Simpson: Maggie, look. What's that?",235000,True,9,5.0,Lisa Simpson,Simpson Home,"Maggie, look. What's that?",maggie look whats that,4.0
1,10369,35,30,Lisa Simpson: Lee-mur. Lee-mur.,237000,True,9,5.0,Lisa Simpson,Simpson Home,Lee-mur. Lee-mur.,lee-mur lee-mur,2.0
2,10370,35,31,Lisa Simpson: Zee-boo. Zee-boo.,239000,True,9,5.0,Lisa Simpson,Simpson Home,Zee-boo. Zee-boo.,zee-boo zee-boo,2.0
3,10372,35,33,Lisa Simpson: I'm trying to teach Maggie that ...,245000,True,9,5.0,Lisa Simpson,Simpson Home,I'm trying to teach Maggie that nature doesn't...,im trying to teach maggie that nature doesnt e...,24.0
4,10374,35,35,"Lisa Simpson: It's like an ox, only it has a h...",254000,True,9,5.0,Lisa Simpson,Simpson Home,"It's like an ox, only it has a hump and a dewl...",its like an ox only it has a hump and a dewlap...,18.0


In [5]:
# Plain Python list of the normalized text column (may still hold non-string entries).
phrases = simpson_data['normalized_text'].tolist()

In [6]:
phrases[:3]

['maggie look whats that', 'lee-mur lee-mur', 'zee-boo zee-boo']

In [7]:
# Keep only the entries that are real strings.
phrases_cleaned = [phrase for phrase in phrases if isinstance(phrase, str)]

In [8]:
# Every string phrase split into a list of its characters.
text = [list(p) for p in phrases if isinstance(p, str)]

In [107]:
# Character vocabulary: the lowercase Latin letters plus the space, in sorted order.
CHARS = sorted({*'abcdefghijklmnopqrstuvwxyz '})
# Position 0 is reserved for the 'none' token, used for characters outside the vocabulary.
INDEX_TO_CHAR = ['none', *CHARS]
# Inverse lookup: token -> its position in INDEX_TO_CHAR.
CHAR_TO_INDEX = {token: position for position, token in enumerate(INDEX_TO_CHAR)}

In [10]:
max_len = 50
# One row per cleaned phrase. Sizing by `phrases_cleaned` (not `phrases`) avoids
# all-zero trailing rows for the non-string entries that were filtered out.
# Cells that are never written stay 0, which is also the index of the 'none' token.
X = torch.zeros([len(phrases_cleaned), max_len], dtype=int)
for i, sentence in enumerate(phrases_cleaned):
    # Truncate with max_len (not a literal) so the column index can never
    # run past the tensor width if max_len is changed.
    for j, letter in enumerate(sentence[:max_len]):
        X[i, j] = CHAR_TO_INDEX.get(letter, CHAR_TO_INDEX['none'])

In [108]:
CHAR_TO_INDEX

{'none': 0,
 ' ': 1,
 'a': 2,
 'b': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'g': 8,
 'h': 9,
 'i': 10,
 'j': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27}

In [13]:
phrases_cleaned[0]

'maggie look whats that'

In [11]:
X[0]

tensor([20, 19, 26, 26, 24,  8, 16,  1, 14, 14, 23, 16, 18,  9, 19,  7, 10, 16,
         7,  9, 19,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

### Embedding and RNN layer

In [22]:
# Stand-alone embedding layer: one 28-dimensional vector per vocabulary index.
embedding = nn.Embedding(num_embeddings=len(CHAR_TO_INDEX), embedding_dim=28)
# Embed the first two encoded phrases to inspect the resulting shape.
t = embedding(X[:2])

In [29]:
t.shape, X[:2].shape

(torch.Size([2, 50, 28]), torch.Size([2, 50]))

In [31]:
t

tensor([[[ 0.4012,  0.7452, -0.6741,  ...,  0.4302,  0.4272, -0.7302],
         [-0.9359,  0.9528,  0.7382,  ..., -0.5555, -0.5714, -0.9304],
         [ 0.9221,  0.8647, -0.2065,  ..., -1.2585,  0.2440, -0.4772],
         ...,
         [-0.6144, -1.4551,  0.7483,  ...,  0.3575,  1.8186, -0.4532],
         [-0.6144, -1.4551,  0.7483,  ...,  0.3575,  1.8186, -0.4532],
         [-0.6144, -1.4551,  0.7483,  ...,  0.3575,  1.8186, -0.4532]],

        [[ 1.1126,  0.7447,  0.2865,  ..., -0.7506,  1.3584,  1.0794],
         [ 0.8599,  0.1234,  1.0468,  ...,  1.0508, -1.1131,  2.5102],
         [ 0.8599,  0.1234,  1.0468,  ...,  1.0508, -1.1131,  2.5102],
         ...,
         [-0.6144, -1.4551,  0.7483,  ...,  0.3575,  1.8186, -0.4532],
         [-0.6144, -1.4551,  0.7483,  ...,  0.3575,  1.8186, -0.4532],
         [-0.6144, -1.4551,  0.7483,  ...,  0.3575,  1.8186, -0.4532]]],
       grad_fn=<EmbeddingBackward0>)

In [30]:
X[:2]

tensor([[20, 19, 26, 26, 24,  8, 16,  1, 14, 14, 23, 16, 18,  9, 19,  7, 10, 16,
          7,  9, 19,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  8,  8,  0, 20, 12, 21, 16,  1,  8,  8,  0, 20, 12, 21,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]])

In [32]:
# Stand-alone RNN over the embedded phrases: 28 input features, 128 hidden units,
# batch dimension first.
rnn = nn.RNN(input_size=28, hidden_size=128, batch_first=True)
# `o` holds the per-step outputs, `s` the final hidden state.
o, s = rnn(t)
o.shape, s.shape

(torch.Size([2, 50, 128]), torch.Size([1, 2, 128]))

In [33]:
o

tensor([[[-0.1177,  0.1854,  0.1870,  ..., -0.1068,  0.0458, -0.2833],
         [-0.1194,  0.3448,  0.0483,  ..., -0.0972, -0.0458,  0.6143],
         [-0.2528,  0.2589, -0.4974,  ..., -0.1175,  0.1720, -0.0871],
         ...,
         [ 0.2580,  0.0861,  0.5146,  ...,  0.1763, -0.1065,  0.2079],
         [ 0.2580,  0.0861,  0.5146,  ...,  0.1763, -0.1065,  0.2079],
         [ 0.2580,  0.0861,  0.5146,  ...,  0.1763, -0.1065,  0.2079]],

        [[-0.2515,  0.2929, -0.2898,  ..., -0.0027, -0.0113,  0.2344],
         [-0.5529,  0.1068,  0.2482,  ...,  0.3495, -0.2969, -0.3739],
         [-0.4666, -0.1370, -0.0140,  ...,  0.5581, -0.4114, -0.4334],
         ...,
         [ 0.2580,  0.0861,  0.5146,  ...,  0.1763, -0.1065,  0.2079],
         [ 0.2580,  0.0861,  0.5146,  ...,  0.1763, -0.1065,  0.2079],
         [ 0.2580,  0.0861,  0.5146,  ...,  0.1763, -0.1065,  0.2079]]],
       grad_fn=<TransposeBackward1>)

In [34]:
s

tensor([[[ 2.5798e-01,  8.6096e-02,  5.1462e-01,  2.7532e-01, -1.2804e-02,
          -5.5907e-02,  1.9148e-01, -1.1949e-01,  2.4506e-01,  1.2066e-01,
           1.7921e-01, -2.7090e-01,  4.6904e-02, -2.4554e-01,  1.8467e-01,
          -2.5699e-01,  4.4413e-01, -4.5050e-02,  4.3491e-01,  1.9214e-01,
           1.0260e-01, -3.7560e-01,  1.7719e-01, -1.8062e-01, -3.9982e-01,
           3.3827e-01, -1.9555e-02, -2.0602e-01, -3.2931e-01, -1.3029e-01,
           2.0961e-02, -3.3736e-02, -3.6602e-01,  1.1025e-01, -9.4655e-02,
          -9.0530e-02,  1.4991e-01,  1.4714e-03,  1.1206e-01, -3.6420e-02,
           7.0558e-02, -4.0003e-01,  2.3466e-01, -8.3561e-02,  7.7733e-02,
          -1.6386e-01,  2.0178e-01,  2.7132e-02,  2.2086e-01,  6.1084e-02,
          -2.9655e-01, -3.3372e-01, -7.7902e-02,  1.3531e-01,  1.5415e-02,
           1.0794e-01,  1.8747e-01, -2.7202e-01, -3.1681e-01,  1.7532e-01,
           9.3117e-02,  1.2843e-01, -2.4375e-01,  4.1192e-01,  1.1663e-02,
           2.0492e-01,  2

In [42]:
class Network(nn.Module):
    """Character-level model: embedding -> single-layer RNN -> per-step linear layer.

    The vocabulary size is taken from the module-level ``CHAR_TO_INDEX`` when the
    model is constructed.
    """

    def __init__(self):
        super().__init__()
        # 28-dimensional embedding for every vocabulary index.
        self.embed = nn.Embedding(len(CHAR_TO_INDEX), 28)
        # batch_first=True: inputs and outputs are (batch, sequence, features).
        self.rnn = nn.RNN(28, 128, batch_first=True)
        # Projects each hidden vector to one score per vocabulary entry.
        self.linear = nn.Linear(128, len(CHAR_TO_INDEX))

    def forward(self, sentences, state=None):
        """Score every vocabulary entry at every position of ``sentences``.

        Parameters
        ----------
        sentences : torch.Tensor
            Integer tensor of vocabulary indices, shaped (batch, sequence).
        state : torch.Tensor, optional
            Initial hidden state handed to the RNN, shaped (1, batch, 128).
            ``None`` lets the RNN start from its default (zero) state.

        Returns
        -------
        torch.Tensor
            Unnormalized scores shaped (batch, sequence, len(CHAR_TO_INDEX)).
        """
        embed = self.embed(sentences)
        # Pass the caller's state through; previously it was accepted but ignored.
        output, _ = self.rnn(embed, state)
        return self.linear(output)

In [44]:
model = Network()

In [None]:
# Multi-class loss over the character vocabulary.
criterion = nn.CrossEntropyLoss()
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.05)

In [None]:
CHAR_TO_INDEX

In [None]:
def generate_sentence(seed='hello', max_len=50):
    """Greedily extend ``seed`` one character at a time with the global ``model``.

    Parameters
    ----------
    seed : str, optional
        Starting text. Characters missing from ``CHAR_TO_INDEX`` are fed to the
        model as the 'none' token but kept verbatim in the returned string.
    max_len : int, optional
        Maximum number of characters to generate after the seed.

    Returns
    -------
    str
        The seed followed by the generated characters. Generation stops early
        when the model predicts the 'none' token.

    Raises
    ------
    ValueError
        If ``seed`` is empty (there would be no position to predict from).
    """
    if not seed:
        raise ValueError('seed must contain at least one character')

    sentence = [*seed]
    unknown = CHAR_TO_INDEX['none']
    # Shape (1, len(seed)): a batch holding the single encoded seed.
    x = torch.tensor([[CHAR_TO_INDEX.get(ch, unknown) for ch in sentence]], dtype=torch.long)

    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(max_len):
            scores = model(x)
            # Most likely next character given everything generated so far.
            # Converting to a Python int avoids relying on how argmax handles
            # keepdim without dim, and on indexing a list with a tensor.
            next_index = int(torch.argmax(scores[0, -1]))
            next_char = INDEX_TO_CHAR[next_index]

            if next_char == 'none':
                break

            sentence.append(next_char)
            x = torch.cat([x, torch.tensor([[next_index]], dtype=x.dtype)], dim=1)

    return ''.join(sentence)

In [None]:
batch_size = 100
# Ceil division: the final, smaller batch is trained on too, and a data set with
# fewer than `batch_size` rows still yields one batch instead of none.
n_batches = (len(X) + batch_size - 1) // batch_size

for ep in range(200 + 1):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(n_batches):
        batch = X[i * batch_size:(i + 1) * batch_size]
        # Next-character prediction: inputs are all positions but the last,
        # targets are the same rows shifted left by one.
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()

        optimizer.zero_grad()

        answers = model(X_batch)
        # Collapse (batch, sequence) so each position is one classification sample.
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    # max(..., 1) keeps the report from dividing by zero when X is empty.
    print(f'\nEpoch {ep}, Time: {time.time() - start:.3f}, Train loss: {train_loss / max(train_passed, 1)}')

    # Every fifth epoch also show a generated sample.
    if ep % 5 == 0:
        sample = generate_sentence()
        print(sample)

## Caesar cipher

In [None]:
phrases[:10]

#### Shift by 13 (ROT13)

In [402]:
# The 26 lowercase Latin letters, built from their code points.
alphabet = {chr(code) for code in range(ord('a'), ord('z') + 1)}

In [403]:
# Letters in alphabetical order.
orig = np.array(sorted(alphabet))
# The same letters rotated by 13 positions.
shifted = np.roll(orig, shift=13)

In [404]:
# Substitution table: original letter -> letter at the same position in `shifted`.
orig_shift = dict(zip(orig, shifted))
orig_shift

{'a': 'n',
 'b': 'o',
 'c': 'p',
 'd': 'q',
 'e': 'r',
 'f': 's',
 'g': 't',
 'h': 'u',
 'i': 'v',
 'j': 'w',
 'k': 'x',
 'l': 'y',
 'm': 'z',
 'n': 'a',
 'o': 'b',
 'p': 'c',
 'q': 'd',
 'r': 'e',
 's': 'f',
 't': 'g',
 'u': 'h',
 'v': 'i',
 'w': 'j',
 'x': 'k',
 'y': 'l',
 'z': 'm'}

In [405]:
# Encode every cleaned phrase with the substitution table; characters that have
# no table entry become a space.
shifted_phrases = [
    ''.join(orig_shift.get(ch, ' ') for ch in phrase)
    for phrase in phrases_cleaned
]

In [406]:
max_len = 50
# Targets: the original (unshifted) phrases as vocabulary indices, one row each.
# Cells that are never written stay 0, which is also the index of the 'none' token.
Y = torch.zeros([len(phrases_cleaned), max_len], dtype=int)
for i, sentence in enumerate(phrases_cleaned):
    # Truncate with max_len (not a literal) so the column index can never
    # run past the tensor width if max_len is changed.
    for j, letter in enumerate(sentence[:max_len]):
        Y[i, j] = CHAR_TO_INDEX.get(letter, CHAR_TO_INDEX['none'])

In [407]:
max_len = 50
# Inputs: the shifted phrases as vocabulary indices, one row each.
# Cells that are never written stay 0, which is also the index of the 'none' token.
X = torch.zeros([len(shifted_phrases), max_len], dtype=int)
for i, sentence in enumerate(shifted_phrases):
    # Truncate with max_len (not a literal) so the column index can never
    # run past the tensor width if max_len is changed.
    for j, letter in enumerate(sentence[:max_len]):
        X[i, j] = CHAR_TO_INDEX.get(letter, CHAR_TO_INDEX['none'])

In [408]:
class Network(nn.Module):
    """Character-level model: embedding -> single-layer RNN -> per-step linear layer.

    The vocabulary size is taken from the module-level ``CHAR_TO_INDEX`` when the
    model is constructed.
    """

    def __init__(self):
        super().__init__()
        # 28-dimensional embedding for every vocabulary index.
        self.embed = nn.Embedding(len(CHAR_TO_INDEX), 28)
        # batch_first=True: inputs and outputs are (batch, sequence, features).
        self.rnn = nn.RNN(28, 128, batch_first=True)
        # Projects each hidden vector to one score per vocabulary entry.
        self.linear = nn.Linear(128, len(CHAR_TO_INDEX))

    def forward(self, sentences, state=None):
        """Score every vocabulary entry at every position of ``sentences``.

        Parameters
        ----------
        sentences : torch.Tensor
            Integer tensor of vocabulary indices, shaped (batch, sequence).
        state : torch.Tensor, optional
            Initial hidden state handed to the RNN, shaped (1, batch, 128).
            ``None`` lets the RNN start from its default (zero) state.

        Returns
        -------
        torch.Tensor
            Unnormalized scores shaped (batch, sequence, len(CHAR_TO_INDEX)).
        """
        embed = self.embed(sentences)
        # Pass the caller's state through; previously it was accepted but ignored.
        output, _ = self.rnn(embed, state)
        return self.linear(output)

In [409]:
model = Network()

In [410]:
# Multi-class loss over the character vocabulary.
criterion = nn.CrossEntropyLoss()
# Plain SGD over all model parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.05)

In [411]:
batch_size = 100
# Ceil division: the final, smaller batch is trained on too, and a data set with
# fewer than `batch_size` rows still yields one batch instead of none.
n_batches = (len(X) + batch_size - 1) // batch_size

for ep in range(10 + 1):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(n_batches):
        lo, hi = i * batch_size, (i + 1) * batch_size
        # Inputs are the shifted phrases; targets are the original phrases at
        # the same positions, flattened to one label per character.
        X_batch = X[lo:hi]
        Y_batch = Y[lo:hi].flatten()

        optimizer.zero_grad()

        answers = model(X_batch)
        # Collapse (batch, sequence) so each position is one classification sample.
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    # max(..., 1) keeps the report from dividing by zero when X is empty.
    print(f'\nEpoch {ep}, Time: {time.time() - start:.3f}, Train loss: {train_loss / max(train_passed, 1)}')


Epoch 0, Time: 2.314, Train loss: 1.2632088531498555

Epoch 1, Time: 2.259, Train loss: 0.42562279132781206

Epoch 2, Time: 2.251, Train loss: 0.23060751278643254

Epoch 3, Time: 2.266, Train loss: 0.14830621242247247

Epoch 4, Time: 2.314, Train loss: 0.104971952839858

Epoch 5, Time: 2.337, Train loss: 0.07978023257520464

Epoch 6, Time: 2.251, Train loss: 0.06420968983460355

Epoch 7, Time: 2.315, Train loss: 0.05399840632108627

Epoch 8, Time: 2.315, Train loss: 0.04690321621105627

Epoch 9, Time: 2.282, Train loss: 0.04171263141971496

Epoch 10, Time: 2.324, Train loss: 0.0377482472298046


In [417]:
text = """
species of flowering plants with showy flowers. It takes its name from the Greek word for a rainbow, which is also the name for the Greek goddess of the rainbow, Iris. Some authors state that the name refers to the wide variety of flower colors found among the many species.[3] As well as being the scientific name, iris is also widely used as a common name for all Iris species, as well as some belonging to other closely related genera. A common name for some species is 'flags', while the plants of the subgenus Scorpiris are widely known as 'junos', particularly in horticulture. It is a popular garden flower.
"""

# Lowercase and trim the sample, then keep only characters that are vocabulary tokens.
text_norm = ''.join(filter(lambda letter: letter in INDEX_TO_CHAR, text.lower().strip()))
# Encode with the substitution table; characters without an entry become a space.
test_phrase = ''.join(orig_shift.get(symbol, ' ') for symbol in text_norm)

In [418]:
max_len = 50
# A single-row batch; positions past the end of the phrase stay 0.
X_test = torch.zeros([1, max_len], dtype=int)
# Only the first max_len characters of the encoded phrase fit into the row.
for j, letter in enumerate(test_phrase[:max_len]):
    X_test[0, j] = CHAR_TO_INDEX.get(letter, CHAR_TO_INDEX['none'])

In [419]:
# Highest-scoring vocabulary index at every position of the test row.
predicted = torch.argmax(model(X_test)[0], axis=1)
# Print each input character next to the character the model predicts for it.
for source_idx, predicted_idx in zip(X_test[0], predicted):
    print(INDEX_TO_CHAR[source_idx], INDEX_TO_CHAR[predicted_idx])

f s
c p
r e
p c
v i
r e
f s
   
b o
s f
   
s f
y l
b o
j w
r e
e r
v i
a n
t g
   
c p
y l
n a
a n
g t
f s
   
j w
v i
g t
u h
   
f s
u h
b o
j w
l y
   
s f
y l
b o
j w
r e
e r
f s
   
v i
g t
   
