In [1]:
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import DataLoader

from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.trainers import UnigramTrainer
from tokenizers.processors import TemplateProcessing

In [2]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed PyTorch's global RNG so that batch shuffling, weight initialisation and
# name sampling start from the same state on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

DEVICE

'cuda'

# Data Processing

In [3]:
# Load the pet-name table; the preview below shows a name column (`имя`)
# and a language-tag column (`язык`).
data = pd.read_csv('data/pets_ru_en.csv')

data

Unnamed: 0,имя,язык
0,Acapella,en
1,Achilles,en
2,Adriana,en
3,Alpha,en
4,Alyssum,en
...,...,...
2908,Ярика,rus
2909,Яриска,rus
2910,Ярка,rus
2911,Яркиса,rus


In [4]:
# Character count of the longest name in the table.
max_len = data['имя'].str.len().max()
max_len

22

In [5]:
# Character-level Unigram tokenizer: `max_piece_length=1` in the trainer below
# restricts learned pieces to single characters.
# NOTE: `model` here is the tokenizer model; the name is rebound to the neural
# network further down the notebook.
model = Unigram()
tokenizer = Tokenizer(model)
tokenizer.normalizer = Lowercase()
# Append <EOS> to every encoded name. Id 3 matches the position of '<EOS>' in
# the trainer's `special_tokens` list below.
tokenizer.post_processor = TemplateProcessing('$0 <EOS>', special_tokens=[("<EOS>", 3)])

# Pad / truncate every encoding to one fixed window so samples can be batched.
# The window is taken from `max_len` (computed above) instead of repeating the
# literal 22, and `length=` replaces the deprecated `max_length=` keyword of
# `enable_padding`.
seq_len = int(max_len)
tokenizer.enable_padding(direction='right', pad_id=0, pad_token='<PAD>', length=seq_len)
tokenizer.enable_truncation(max_length=seq_len)

# The order of the special tokens fixes their ids: <PAD>=0, <SOS>=1, <UNK>=2,
# <EOS>=3, <RU>=4, <EN>=5.
trainer = UnigramTrainer(special_tokens=['<PAD>', '<SOS>', '<UNK>', '<EOS>', '<RU>', '<EN>'], 
                         unk_token='<UNK>', pad_token='<PAD>', max_piece_length=1)

tokenizer.train_from_iterator(data.имя, trainer=trainer)

In [6]:
# Number of entries in the trained vocabulary (special tokens included).
tokenizer.get_vocab_size()

66

In [7]:
# Token -> id mapping produced by training.
tokenizer.get_vocab()

{'q': 64,
 'п': 39,
 'w': 54,
 'с': 11,
 'у': 28,
 '-': 62,
 'ы': 59,
 'z': 57,
 'j': 61,
 'т': 22,
 'й': 50,
 '<UNK>': 2,
 'y': 38,
 'i': 17,
 'н': 13,
 'э': 42,
 's': 20,
 '<SOS>': 1,
 'a': 9,
 'h': 34,
 'ц': 56,
 'g': 41,
 'k': 48,
 'р': 16,
 'а': 6,
 'м': 26,
 'b': 36,
 'л': 10,
 '<EN>': 5,
 'ш': 31,
 'ф': 37,
 'ж': 51,
 'u': 29,
 'x': 58,
 'к': 14,
 'n': 19,
 ' ': 60,
 'c': 27,
 'f': 46,
 'ю': 53,
 '<PAD>': 0,
 'и': 7,
 '<RU>': 4,
 'o': 18,
 'з': 44,
 'б': 43,
 't': 23,
 'd': 33,
 'в': 45,
 'r': 12,
 'х': 52,
 'p': 30,
 'ё': 63,
 'щ': 65,
 'я': 25,
 'д': 35,
 'e': 8,
 '<EOS>': 3,
 'v': 55,
 'l': 21,
 'm': 32,
 'ь': 40,
 'ч': 49,
 'о': 24,
 'е': 15,
 'г': 47}

# Dataset Creation

In [8]:
class NameGenDataset():
    """Map-style dataset of (input, target) token-id tensors for next-character prediction.

    For a row ``(name, language)`` the input is the language token followed by
    the encoded name with its last position dropped, and the target is the full
    encoded name. Both tensors therefore have the same length and the target at
    position ``t`` is the token that follows the input at position ``t``.
    """

    def __init__(self, data, tokenizer):
        """
        Args:
            data: table whose rows unpack to ``(name, language)`` through
                ``data.iloc[idx]``.
            tokenizer: object providing ``encode(text).ids`` and
                ``token_to_id(token)``.

        Raises:
            ValueError: if the tokenizer has no id for ``<RU>`` or ``<EN>``.
        """
        self.data = data
        self.tokenizer = tokenizer

        # Resolve the language-token ids from the tokenizer instead of
        # hard-coding their positions in the vocabulary.
        self.ru_id = tokenizer.token_to_id('<RU>')
        self.en_id = tokenizer.token_to_id('<EN>')
        if self.ru_id is None or self.en_id is None:
            raise ValueError("tokenizer must define the '<RU>' and '<EN>' tokens")

    def __getitem__(self, idx):
        """Return ``(x, y)`` id tensors for the row at position ``idx``."""
        name, lang = self.data.iloc[idx]
        ids = torch.tensor(self.tokenizer.encode(name).ids, dtype=torch.long)

        # Any language tag other than 'rus' is treated as English.
        lang_id = self.ru_id if lang == 'rus' else self.en_id

        # Prepending the language token shifts the name one step to the right,
        # so the last encoded position is dropped from the input only.
        x = torch.cat((torch.tensor([lang_id], dtype=torch.long), ids[:-1]))

        # The target is the unshifted encoding. Taking it directly (rather than
        # re-padding the shifted input) keeps the final token, so a name that
        # fills the whole window still has its <EOS> as a training target.
        y = ids

        return x, y

    def __len__(self):
        """Number of rows in the underlying table."""
        return len(self.data)

In [9]:
# Wrap the name table in the dataset and serve it in shuffled mini-batches.
dataset = NameGenDataset(data=data, tokenizer=tokenizer)
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=128,
)

In [10]:
# Pull a single batch to sanity-check the tensor shapes.
x, y = next(iter(dataloader))

x.shape, y.shape

(torch.Size([128, 22]), torch.Size([128, 22]))

In [11]:
# Inputs of the sampled batch: each row starts with a language token id (4 or 5).
x

tensor([[ 5, 20, 18,  ...,  0,  0,  0],
        [ 5, 32,  9,  ...,  0,  0,  0],
        [ 4, 24, 37,  ...,  0,  0,  0],
        ...,
        [ 5, 27, 12,  ...,  0,  0,  0],
        [ 4, 25, 13,  ...,  0,  0,  0],
        [ 4, 37,  6,  ...,  0,  0,  0]])

In [12]:
# Targets of the sampled batch: the inputs shifted one position to the left.
y

tensor([[20, 18, 27,  ...,  0,  0,  0],
        [32,  9, 32,  ...,  0,  0,  0],
        [24, 37,  7,  ...,  0,  0,  0],
        ...,
        [27, 12, 29,  ...,  0,  0,  0],
        [25, 13, 25,  ...,  0,  0,  0],
        [37,  6, 13,  ...,  0,  0,  0]])

# Model

In [13]:
def train_loop(model, dataloader, loss, optimizer, verbose=False):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: module whose forward returns ``(logits, hidden)``; the logits
            are compared against ``y.flatten()``, so they must be flattened
            over the batch and time dimensions.
        dataloader: sized iterable of ``(X, y)`` tensor batches.
        loss: criterion called as ``loss(logits, y.flatten())``.
        optimizer: optimizer updating ``model``'s parameters.
        verbose: falsy to stay silent; otherwise an integer interval. The batch
            loss is printed every ``verbose`` batches and the epoch mean is
            printed at the end.

    Returns:
        float: sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()

    # Send batches to wherever the model lives rather than to a module-level
    # constant, so the loop also works when the model is not on that device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    epoch_loss = 0

    for ind, (X, y) in enumerate(dataloader):
        if device is not None:
            X, y = X.to(device), y.to(device)

        pred_logits, _ = model(X)
        batch_loss = loss(pred_logits, y.flatten())

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_value = batch_loss.item()
        epoch_loss += batch_value

        if verbose and ind % verbose == 0:
            print(f'Loss: {round(batch_value, 5)}')

    epoch_loss /= len(dataloader)

    if verbose:
        print(f'Train Loss: {round(epoch_loss, 5)}\n')

    return epoch_loss

In [14]:
class RNNModel(nn.Module):
    """Embedding -> single-layer GRU -> linear head over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim, hidden_size):
        """
        Args:
            vocab_size: number of token ids; also the width of the output logits.
            embedding_dim: size of each token embedding.
            hidden_size: size of the GRU hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.classifier = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, X, h=None):
        """Return ``(logits, hidden)`` for a batch of token ids.

        The GRU outputs are flattened over their first two dimensions before
        the linear head, so the logits have one row per (sequence, step) pair.
        ``h`` is an optional initial hidden state; the updated state is
        returned so generation can continue from it.
        """
        step_outputs, last_hidden = self.gru(self.embedding(X), h)
        per_step = step_outputs.flatten(0, 1)
        return self.classifier(per_step), last_hidden

# Name Generation and Model Training

In [15]:
def generate_name(model, first_token='<RU>', max_len=22, ignore_context=False):
    """Sample one name from the model, one character at a time.

    Generation starts from ``first_token`` (a language token such as ``<RU>``
    or ``<EN>``), feeds each sampled id back in together with the recurrent
    hidden state, and stops at ``<EOS>`` or after ``max_len`` sampled tokens.

    Args:
        model: network whose forward takes ``(ids, hidden)`` and returns
            ``(logits, hidden)``.
        first_token: token string used as the first input.
        max_len: maximum number of tokens to sample.
        ignore_context: currently unused; kept so existing calls keep working.

    Returns:
        str: the generated name in title case.

    Raises:
        ValueError: if ``first_token`` is not in the tokenizer's vocabulary.

    Note:
        The model is switched to eval mode and left in it.
    """
    start_id = tokenizer.token_to_id(first_token)
    if start_id is None:
        raise ValueError(f'Unknown first token: {first_token!r}')
    eos_id = tokenizer.token_to_id('<EOS>')

    model.eval()

    # Sample on the model's own device instead of moving the model to the CPU
    # and back on every call; a failure mid-generation can then no longer leave
    # the model stranded on the wrong device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    sampled_ids = []
    current_id = start_id
    h = None
    with torch.no_grad():
        for _ in range(max_len):
            step_input = torch.tensor([[current_id]], device=device)

            step_logits, h = model(step_input, h)
            probs = step_logits[-1].softmax(dim=-1)

            # Keep plain Python ints in the sequence, not one-element tensors.
            current_id = torch.multinomial(probs, 1).item()

            if current_id == eos_id:
                break

            sampled_ids.append(current_id)

    # Decode id by id and concatenate: decoding the whole list at once joins
    # tokens with spaces, and stripping those would also delete genuine spaces
    # inside a name. Special tokens decode to an empty string by default.
    name = ''.join(tokenizer.decode([token_id]) for token_id in sampled_ids)
    return name.title()

In [16]:
# --- Training schedule and reporting ---
lr = 1e-2
n_epochs = 60
print_every = 20      # epochs between progress reports
verbose_every = 10
verbose = False

model_weights_path = 'model_weights.pth'

# --- Model, criterion, optimiser ---
model = RNNModel(
    vocab_size=tokenizer.get_vocab_size(),
    embedding_dim=200,
    hidden_size=128,
)
model = model.to(DEVICE)

# Targets equal to 0 (the <PAD> id) do not contribute to the loss.
loss = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=lr, weight_decay=1e-4)

In [17]:
# Train for epochs 0..n_epochs inclusive so that the final epoch also lands on
# a reporting step when n_epochs is a multiple of print_every.
for epoch in range(n_epochs + 1):
    is_report_epoch = epoch % print_every == 0

    if is_report_epoch:
        print(f'EPOCH {epoch}\n---------------')

    # On reporting epochs log the batch loss every `verbose_every` batches
    # (the configured constant, not a repeated literal); stay silent otherwise.
    verbose = verbose_every if is_report_epoch else False

    train_loss = train_loop(model, dataloader, loss, optimizer, verbose)

    if is_report_epoch:
        # Show a few samples per language to eyeball progress.
        for lang in ['<RU>', '<EN>']:
            names = [generate_name(model, lang) for _ in range(3)]
            print(f'Generated {lang} Names:')
            for ind, name in enumerate(names):
                print(f'{ind + 1} - {name}')

        print()

torch.save(model.state_dict(), model_weights_path)

EPOCH 0
---------------
Loss: 4.21957
Loss: 2.47712
Loss: 2.43744
Train Loss: 2.67618

Generated <RU> Names:
1 - Розаррика
2 - Цейза
3 - Моняда
Generated <EN> Names:
1 - Ganflichern
2 - Kandanntpe
3 - Banlermannro

EPOCH 20
---------------
Loss: 1.43492
Loss: 1.51585
Loss: 1.55633
Train Loss: 1.50704

Generated <RU> Names:
1 - Элизма
2 - Чоша
3 - Джсефи
Generated <EN> Names:
1 - Tiny
2 - Chinmer
3 - Nytman

EPOCH 40
---------------
Loss: 1.30118
Loss: 1.30806
Loss: 1.38868
Train Loss: 1.32585

Generated <RU> Names:
1 - Люсик
2 - Жесика
3 - Гатэн
Generated <EN> Names:
1 - Pal
2 - Tоockle
3 - Seal

EPOCH 60
---------------
Loss: 1.21899
Loss: 1.28072
Loss: 1.31083
Train Loss: 1.27748

Generated <RU> Names:
1 - Царья
2 - Зетти
3 - Расума
Generated <EN> Names:
1 - Cato
2 - Bambi
3 - Puff



# Name Generation

In [18]:
# Rebuild the network with the same sizes used for training and restore the
# saved weights. `map_location` remaps the stored tensors onto the current
# device, so a checkpoint written on a GPU still loads on a CPU-only machine.
model = RNNModel(tokenizer.get_vocab_size(), 200, 128).to(DEVICE)
model.load_state_dict(torch.load(model_weights_path, map_location=DEVICE, weights_only=True))

<All keys matched successfully>

In [19]:
# Sample a handful of Russian names from the restored model.
n = 10
lang = '<RU>'

names = []
for _ in range(n):
    names.append(generate_name(model, lang))

print(f'Generated {lang} Names:')
for position, generated in enumerate(names, start=1):
    print(f'{position} - {generated}')

Generated <RU> Names:
1 - Санни
2 - Персида
3 - Пумка
4 - Темантея
5 - Дери
6 - Ханда
7 - Русалка
8 - Джеси
9 - Тапана
10 - Рыжинка


In [20]:
# Sample a handful of English names from the restored model.
n = 10
lang = '<EN>'

names = []
for _ in range(n):
    names.append(generate_name(model, lang))

print(f'Generated {lang} Names:')
for position, generated in enumerate(names, start=1):
    print(f'{position} - {generated}')

Generated <EN> Names:
1 - Deesycaio
2 - Darlos
3 - Oscar
4 - Sesame
5 - Shaggy
6 - Chrystal
7 - Roаfaffer
8 - Khenelau
9 - Bling
10 - Qearma
