Getting and "cleaning" the dataset

In [12]:
import requests
from bs4 import BeautifulSoup

m_names = []
f_names = []
for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    url = f'https://vardai.vlkk.lt/sarasas/{key}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    m_links = soup.find_all('a', class_='names_list__links names_list__links--man')
    f_links = soup.find_all('a', class_='names_list__links names_list__links--woman')
    m_names += [m_name.text for m_name in m_links]
    f_names += [f_name.text for f_name in f_links]

In [6]:
# Combine both lists
combined_lists = m_names + f_names

# Get all unique characters
unique_chars = set("".join(combined_lists))

# Sort the characters for better readability (optional)
unique_chars = sorted(unique_chars)

# Print the result
print("Unique characters:", unique_chars)


Unique characters: ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ą', 'č', 'ė', 'ę', 'š', 'ū', 'ž']


In [4]:
def clean_names(names):
  # Define the character mappings, including capital letters to lowercase
  char_mappings = {
      'Á': 'a',
      'Ã': 'a',
      'È': 'e',
      'Ì': 'i',
      'Ò': 'o',
      'Õ': 'o',
      'Ù': 'u',
      'à': 'a',
      'á': 'a',
      'ã': 'a',
      'è': 'e',
      'é': 'e',
      'ì': 'i',
      'ñ': 'n',
      'ò': 'o',
      'ó': 'o',
      'õ': 'o',
      'ù': 'u',
      'ý': 'y',
      'Ą': 'ą',
      'Č': 'č',
      'Ė': 'ė',
      'ĩ': 'i',
      'Š': 'š',
      'ũ': 'u',
      'Ū': 'ū',
      'Ž': 'ž',
      'Ẽ': 'e',
      'ẽ': 'e',
      'ỹ': 'y',
      '̀': '',
      '́': '',
      '̃': '',
      }

  # Add uppercase-to-lowercase mappings
  char_mappings.update({chr(i): chr(i).lower() for i in range(ord('A'), ord('Z') + 1)})

  # Replace unwanted characters
  def replace_chars(name, mappings):
     for old_char, new_char in mappings.items():
         name = name.replace(old_char, new_char)
     return name

  # Apply the replacement to each name
  names_cleaned = [replace_chars(name.strip(), char_mappings) for name in names]
  return names_cleaned

In [16]:
m_names = clean_names(m_names)
#m_names = [f"M{name}" for name in m_names]
f_names = clean_names(f_names)
#f_names = [f"F{name}" for name in f_names]
#names = m_names + f_names

The model acts a bit weird when female start letter is ė. And we can clearly see why.

In [20]:
count = len(list(filter(lambda item: item.startswith("ė"), f_names)))
print(f"Number of strings starting with 'ė': {count}")

Number of strings starting with 'ė': 0


In [None]:
# Save list to a .txt file
with open("vardai_female", "w") as file:
    for item in f_names:
        file.write(f"{item}\n")

with open("vardai_male", "w") as file:
    for item in m_names:
        file.write(f"{item}\n")

#with open("vardai", "w") as file:
#    for item in names:
#        file.write(f"{item}\n")

# Download the file to your local system
from google.colab import files
files.download("vardai_male")
files.download("vardai_female")
#files.download("vardai")

# add names as the first entry

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The model is pretty much the same as the one used in the lecutre

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [None]:
# Adjusted NameDataset
class NameDataset(Dataset):
    def __init__(self, csv_file):
        self.names = pd.read_csv(csv_file)['name'].values
        self.chars = sorted(list(set(''.join(self.names) + ' ')))  # Including a padding character
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}
        self.vocab_size = len(self.chars)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx] + ' '  # Adding padding character at the end
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name)

# Custom collate function for padding
def pad_collate(batch):
    padded_seqs = pad_sequence(batch, batch_first=True, padding_value=0)
    input_seq = padded_seqs[:, :-1]
    target_seq = padded_seqs[:, 1:]
    return input_seq, target_seq

csv_file_male = 'vardai_male'
csv_file_female = 'vardai_female'
#csv_file = 'vardai'

#dataset = NameDataset(csv_file)
dataset_male = NameDataset(csv_file_male)
dataset_female = NameDataset(csv_file_female)

dataloader_male = DataLoader(dataset_male, batch_size=32, shuffle=True, collate_fn=pad_collate)
dataloader_female = DataLoader(dataset_female, batch_size=32, shuffle=True, collate_fn=pad_collate)

In [None]:
device = "cuda"

In [None]:
# Minimal Transformer Model
class MinimalTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion):
        super(MinimalTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=4)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        positions = torch.arange(0, x.size(1)).unsqueeze(0)
        x = self.embed(x) + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.output_layer(x)
        return x

# Training Loop
def train_model(model, dataloader, epochs=30):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        model.train()  # Ensure the model is in training mode
        total_loss = 0.0
        batch_count = 0

        for batch_idx, (input_seq, target_seq) in enumerate(dataloader):
            input_seq, target_seq = input_seq.to(device), target_seq.to(device)
            optimizer.zero_grad()
            output = model(input_seq)
            loss = criterion(output.transpose(1, 2), target_seq)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batch_count += 1

        average_loss = total_loss / batch_count
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')

In [None]:
modelM = MinimalTransformer(vocab_size=dataset_male.vocab_size, embed_size=128, num_heads=8, forward_expansion=4).to(device)
train_model(modelM, dataloader_male)

Epoch 1, Average Loss: 1.362299338845182
Epoch 2, Average Loss: 1.194868588743131
Epoch 3, Average Loss: 1.1847481067515602
Epoch 4, Average Loss: 1.1654071591117166
Epoch 5, Average Loss: 1.152295225415348
Epoch 6, Average Loss: 1.1525752145396777
Epoch 7, Average Loss: 1.1441421966907406
Epoch 8, Average Loss: 1.1326697444127611
Epoch 9, Average Loss: 1.138304806937856
Epoch 10, Average Loss: 1.1292295436228603
Epoch 11, Average Loss: 1.130467475938403
Epoch 12, Average Loss: 1.1189188193683781
Epoch 13, Average Loss: 1.1207558887063964
Epoch 14, Average Loss: 1.12533748396172
Epoch 15, Average Loss: 1.1261435811184655
Epoch 16, Average Loss: 1.1184927254668937
Epoch 17, Average Loss: 1.1197640053496873
Epoch 18, Average Loss: 1.114803169877076
Epoch 19, Average Loss: 1.1093780048622572
Epoch 20, Average Loss: 1.1183522442155633
Epoch 21, Average Loss: 1.11354104546476
Epoch 22, Average Loss: 1.105784262507415
Epoch 23, Average Loss: 1.1102416559684376
Epoch 24, Average Loss: 1.10783

In [None]:
modelF = MinimalTransformer(vocab_size=dataset_female.vocab_size, embed_size=128, num_heads=8, forward_expansion=4).to(device)
train_model(modelF, dataloader_female)

Epoch 1, Average Loss: 1.4203582016148961
Epoch 2, Average Loss: 1.2449012555574115
Epoch 3, Average Loss: 1.2238642232758659
Epoch 4, Average Loss: 1.2059936460695768
Epoch 5, Average Loss: 1.1959791071432875
Epoch 6, Average Loss: 1.193355529380024
Epoch 7, Average Loss: 1.1857151447382188
Epoch 8, Average Loss: 1.1847558886484992
Epoch 9, Average Loss: 1.1774727724548568
Epoch 10, Average Loss: 1.1711439508244508
Epoch 11, Average Loss: 1.1724663497810077
Epoch 12, Average Loss: 1.16547133331012
Epoch 13, Average Loss: 1.1635712273138807
Epoch 14, Average Loss: 1.1634415101287956
Epoch 15, Average Loss: 1.1562715208620058
Epoch 16, Average Loss: 1.1620068406700192
Epoch 17, Average Loss: 1.1642670770336812
Epoch 18, Average Loss: 1.1616523113465846
Epoch 19, Average Loss: 1.1546482977114225
Epoch 20, Average Loss: 1.1660183242389135
Epoch 21, Average Loss: 1.1604627326018828
Epoch 22, Average Loss: 1.1505942344665527
Epoch 23, Average Loss: 1.1552493428825437
Epoch 24, Average Loss:

Note: the actual app uses the sample function with temperature.
I could not get the model to distinguish between genders.

In [None]:
def sample(model, dataset, start_str='a', max_length=20):
    model.eval()  # Switch to evaluation mode
    with torch.no_grad():
        # Convert start string to tensor
        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0).to(device)  # Add batch dimension

        output_name = start_str
        for _ in range(max_length - len(start_str)):
            output = model(input_seq.to(device)).to('cpu')

            # Get the last character from the output
            probabilities = torch.softmax(output[0, -1], dim=0)
            # Sample a character from the probability distribution
            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]

            if next_char == ' ':  # Assume ' ' is your end-of-sequence character
                break

            output_name += next_char
            # Update the input sequence for the next iteration
            input_seq = torch.cat([input_seq.to('cpu'), torch.tensor([[next_char_idx]])], dim=1)

        return output_name

Farma
Farvaniju
Farmila
Farvida
Fargva
Farvuzija
Farinanama
Farga
Faragenas
Farcna


In [None]:
# After training your model, generate a name starting with a specific letter
for _ in range(10):
    generated_name = sample(modelM, dataset_male, start_str='ar')
    print(generated_name)

ardviudas
arsmantijudas
arvis
argineolmas
arivaldas
arofas
arilintas
arolijus
arriolmas
arnijus


In [None]:
# After training your model, generate a name starting with a specific letter
for _ in range(10):
    generated_name = sample(modelF, dataset_female, start_str='a')
    print(generated_name)

aoranta
adasa
aive
anainė
avė
aeudonia
arestė
afrmboda
alelėja
aremaretė


In [None]:
import json

torch.save(modelM, '../namesformer_model_male.pt')
torch.save(modelF, '../namesformer_model_female.pt')


with open('../int_to_char_female.json', 'w') as f:
    json.dump(dataset_female.int_to_char, f)

with open('../char_to_int_female.json', 'w') as f:
    json.dump(dataset_female.char_to_int, f)

with open('../int_to_char_male.json', 'w') as f:
    json.dump(dataset_male.int_to_char, f)

with open('../char_to_int_male.json', 'w') as f:
    json.dump(dataset_male.char_to_int, f)