<a href="https://colab.research.google.com/github/andriusrak/vardu-generatorius/blob/main/Vardugeneratorius2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import requests
from bs4 import BeautifulSoup
import numpy as np

names_vyro = []
names_moters = []
for key in ['a', 'b', 'c', 'c-2', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
            'm', 'n', 'o', 'p', 'r', 's', 's-2', 't', 'u', 'v', 'z', 'z-2']:
    url_vyro = f'https://vardai.vlkk.lt/sarasas/{key}/?lytis=vyro&kilme='
    url_moters =f'https://vardai.vlkk.lt/sarasas/{key}/?lytis=moters&kilme='

    #gaunam vyrisku vardu list
    response = requests.get(url_vyro)
    soup = BeautifulSoup(response.text, 'html.parser')
    vyro_links = soup.find_all('a', class_='names_list__links names_list__links--man')
    names_vyro += [name.text for name in vyro_links]

    #gaunam moterisku vardu list
    response = requests.get(url_moters)
    soup = BeautifulSoup(response.text, 'html.parser')
    moters_links = soup.find_all('a', class_='names_list__links names_list__links--woman')
    names_moters += [name.text for name in moters_links]


np.savetxt('vyru_vardai.txt', names_vyro, fmt='%s', header='name', comments='', newline='\n')
np.savetxt('moteru_vardai.txt', names_moters, fmt='%s', header='name', comments='', newline='\n')

#sujungtas failas
all_names = names_vyro + names_moters
np.savetxt('vardai.txt', all_names, fmt='%s', header='name', comments='', newline='\n')

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [18]:
class NameDataset(Dataset):
    def __init__(self, male_file, female_file):
        # Read both male and female names
        male_names = pd.read_csv(male_file)['name'].values
        female_names = pd.read_csv(female_file)['name'].values

        # Combine names with gender labels (0 for male, 1 for female)
        #self.names = [(name, 0) for name in male_names] + [(name, 1) for name in female_names]
        #
        # Lowercase raides padarom
        self.names = [(name.lower(), 0) for name in male_names] + [(name.lower(), 1) for name in female_names]



        # Create character vocabulary
        all_names = [name for name, _ in self.names]
        self.chars = sorted(list(set(''.join(all_names) + ' ')))
        self.char_to_int = {c: i for i, c in enumerate(self.chars)}
        self.int_to_char = {i: c for c, i in self.char_to_int.items()}
        self.vocab_size = len(self.chars)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name, gender = self.names[idx]
        name = name + ' '  # Adding padding character
        encoded_name = [self.char_to_int[char] for char in name]
        return torch.tensor(encoded_name), torch.tensor(gender, dtype=torch.long)

In [19]:
def pad_collate(batch):
    # Separate names and genders
    names = [item[0] for item in batch]
    genders = torch.stack([item[1] for item in batch])

    padded_seqs = pad_sequence(names, batch_first=True, padding_value=0)
    input_seq = padded_seqs[:, :-1]
    target_seq = padded_seqs[:, 1:]

    return input_seq, target_seq, genders

In [20]:
class GenderAwareTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, forward_expansion):
        super(GenderAwareTransformer, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.gender_embed = nn.Embedding(2, embed_size)  # 2 for male/female
        self.positional_encoding = nn.Parameter(torch.randn(1, 100, embed_size))
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_size, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.output_layer = nn.Linear(embed_size, vocab_size)

    def forward(self, x, gender):
        # Get embeddings
        char_embeddings = self.embed(x)
        gender_embeddings = self.gender_embed(gender).unsqueeze(1).expand(-1, x.size(1), -1)

        # Combine character and gender embeddings
        x = char_embeddings + gender_embeddings + self.positional_encoding[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = self.output_layer(x)
        return x

In [22]:
def train_model(model, dataloader, epochs=200):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        batch_count = 0

        for batch_idx, (input_seq, target_seq, genders) in enumerate(dataloader):
            optimizer.zero_grad()
            output = model(input_seq, genders)
            loss = criterion(output.transpose(1, 2), target_seq)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            batch_count += 1

        average_loss = total_loss / batch_count
        print(f'Epoch {epoch+1}, Average Loss: {average_loss}')

In [27]:
def sample(model, dataset, gender, start_str='a', max_length=20):
    model.eval()
    with torch.no_grad():
        start_str = start_str.lower()

        chars = [dataset.char_to_int[c] for c in start_str]
        input_seq = torch.tensor(chars).unsqueeze(0)
        gender_tensor = torch.tensor([gender])  # 0 for male, 1 for female

        output_name = start_str
        for _ in range(max_length - len(start_str)):
            output = model(input_seq, gender_tensor)
            probabilities = torch.softmax(output[0, -1], dim=0)
            next_char_idx = torch.multinomial(probabilities, 1).item()
            next_char = dataset.int_to_char[next_char_idx]

            if next_char == ' ':
                break

            output_name += next_char
            input_seq = torch.cat([input_seq, torch.tensor([[next_char_idx]])], dim=1)

        return output_name.capitalize()


In [24]:
# Create dataset with both male and female names
dataset = NameDataset('vyru_vardai.txt', 'moteru_vardai.txt')
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)

# Initialize and train the model
model = GenderAwareTransformer(
    vocab_size=dataset.vocab_size,
    embed_size=128,
    num_heads=8,
    forward_expansion=4
)
train_model(model, dataloader)



Epoch 1, Average Loss: 1.461242005287894
Epoch 2, Average Loss: 1.3170617258124673
Epoch 3, Average Loss: 1.2948501585971697
Epoch 4, Average Loss: 1.2899656693925972
Epoch 5, Average Loss: 1.276165348739021
Epoch 6, Average Loss: 1.2772356839519245
Epoch 7, Average Loss: 1.2679237895332307
Epoch 8, Average Loss: 1.263676757397859
Epoch 9, Average Loss: 1.2578558964220432
Epoch 10, Average Loss: 1.2571673562875378
Epoch 11, Average Loss: 1.2514584620479539
Epoch 12, Average Loss: 1.2486138025762534
Epoch 13, Average Loss: 1.2487417801095564
Epoch 14, Average Loss: 1.2472308966010928
Epoch 15, Average Loss: 1.2416137812637058
Epoch 16, Average Loss: 1.245443407254728
Epoch 17, Average Loss: 1.2426347400359956
Epoch 18, Average Loss: 1.2407919378148708
Epoch 19, Average Loss: 1.2380455395449763
Epoch 20, Average Loss: 1.2369363765000354
Epoch 21, Average Loss: 1.2369518025590498
Epoch 22, Average Loss: 1.2365770851199336
Epoch 23, Average Loss: 1.2389519238189275
Epoch 24, Average Loss: 

In [32]:
# Generate examples
print("Vyru vardai:")
for _ in range(5):
    name = sample(model, dataset, gender=0, start_str='f')
    print(name)

print("\nMoteru vardai:")
for _ in range(5):
    name = sample(model, dataset, gender=1, start_str='L')
    print(name)

Vyru vardai:
Fruelis
Fugovius
Fìlvintas
Flažydo
Fèdodas

Moteru vardai:
Lice
Lafrygė
Ludùzana
Labetndà
Lĩskonda


In [30]:
# Save model
torch.save(model.state_dict(), 'name_model.pt')

# Save mappings
mappings = {
    'char_to_int': dataset.char_to_int,
    'int_to_char': {str(k): v for k, v in dataset.int_to_char.items()},
    'vocab_size': dataset.vocab_size
}
import json
with open('name_mappings.json', 'w', encoding='utf-8') as f:
    json.dump(mappings, f, ensure_ascii=False, indent=2)

padaryt kad lowercase butu, o output pirma raide auto padidina. istrainint ant daugiau epochu