### Prezentacja problemu

Celem projektu jest zbadanie, czy algorytmy genetyczne mogą posłużyć do dobrania optymalnych hiperparametrów dla sieci neuronowej. W przypadku tego projektu sieć ma postać perceptronu wielowarstwowego składającego się z jednej warstwy wejściowej (embedding layer), kilku warstw ukrytych i normalizacyjnych oraz jednej warstwy wyjściowej. \
Przykładowa architektura może wyglądać następująco:

nn.Embedding(vocab_size, n_embd), nn.Flatten() \
nn.Linear(n_embd * block_size, n_hidden), norm_layer(n_hidden), activation_func \
nn.Linear(n_hidden, n_hidden),            norm_layer(n_hidden), activation_func \
nn.Linear(n_hidden, n_hidden),            norm_layer(n_hidden), activation_func \
nn.Linear(n_hidden, n_hidden),            norm_layer(n_hidden), activation_func \
nn.Linear(n_hidden, vocab_size),          norm_layer(vocab_size) \
nn.Softmax(dim=1)

Wszystkie modele zostały wytrenowane i przetestowane na zbiorze 32033 imion. Zadaniem perceptronu jest przewidzenie kolejnego znaku w sekwencji na podstawie trzech poprzednich tokenów. 

In [1]:
import torch
import torch.nn.functional as F
from torch import nn
import matplotlib.pyplot as plt 
import random 
import math
import numpy as np
import pygad

In [2]:
# Read the corpus: one word per line. The context manager closes the file
# handle deterministically, and the explicit encoding avoids depending on the
# platform's default locale.
with open('names.txt', 'r', encoding='utf-8') as names_file:
    words = names_file.read().splitlines()

# Vocabulary: every distinct character found in the corpus, in sorted order.
chars = sorted(set(''.join(words)))

# character -> index; indices start at 1 because 0 is reserved for '.',
# which is used below as the padding / end-of-word marker.
str_to_inx = {ch: inx for inx, ch in enumerate(chars, start=1)}
str_to_inx['.'] = 0

# index -> character (inverse of the mapping above).
inx_to_str = {inx: ch for ch, inx in str_to_inx.items()}
vocab_size = len(inx_to_str)

# Build the (context, target) training pairs.
random.shuffle(words)
block_size = 3  # context length: number of preceding characters used to predict the next one
X, Y = [], []
for name in words:
    # Pad the front with block_size '.' markers and terminate with a single '.'.
    padded = '.' * block_size + name + '.'
    encoded = [str_to_inx[ch] for ch in padded]
    # Slide a window of block_size indices over the word; the index that
    # immediately follows the window is the prediction target.
    for pos in range(len(encoded) - block_size):
        X.append(encoded[pos:pos + block_size])
        Y.append(encoded[pos + block_size])


device = torch.device('cpu')

# Convert the Python lists built above into tensors on the chosen device.
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

# Split boundaries for an 80% / 10% / 10% train / dev / test partition.
train_range = math.ceil(len(X) * 0.8)
dev_range = (len(X) - train_range) // 2
dev_end = train_range + dev_range

training_set, dev_set, test_set = X[:train_range], X[train_range:dev_end], X[dev_end:]
y_training_set, y_dev_set, y_test_set = Y[:train_range], Y[train_range:dev_end], Y[dev_end:]

# Sanity check: the three input partitions together cover every element of X.
split_elements = sum(part.nelement() for part in (training_set, dev_set, test_set))
assert split_elements == X.nelement(), "Bad split"

In [3]:
# 5704

In [4]:
@torch.no_grad()  # evaluation only: no gradient tracking needed
def get_loss(model, data_set):
    """Return the cross-entropy loss of ``model`` on one of the data splits.

    Args:
        model: module that maps the split's input tensor to logits.
        data_set: which split to evaluate: ``'train'``, ``'dev'`` or ``'test'``.

    Returns:
        The loss over the whole split as a Python float.

    Raises:
        KeyError: if ``data_set`` is not one of the three split names.
    """
    x, y = {
        'train': (training_set, y_training_set),
        'dev': (dev_set, y_dev_set),
        'test': (test_set, y_test_set)
    }[data_set]

    # Evaluate in eval mode, then put the model back into whatever mode the
    # caller had it in, so that evaluating in the middle of training does not
    # silently leave normalisation layers in inference mode.
    was_training = model.training
    model.eval()
    try:
        logits = model(x)
        return F.cross_entropy(logits, y).item()
    finally:
        model.train(was_training)

In [5]:
def callback_generation(ga_instance):
    """Print the generation counter and the current best fitness.

    Passed to ``pygad.GA`` as the ``on_generation`` callback.

    Args:
        ga_instance: the GA object whose progress is reported.
    """
    # Hand best_solution() the fitness values the GA has already computed.
    # Without pop_fitness it evaluates the fitness function for the whole
    # population again, which here means retraining every network just to
    # print a progress line.
    best_fitness = ga_instance.best_solution(
        pop_fitness=ga_instance.last_generation_fitness
    )[1]
    print(f"Generation = {ga_instance.generations_completed}")
    print(f"Fitness    = {best_fitness}")

In [6]:
class _StackedMLP(nn.Module):
    """Shared implementation of the MLP0..MLP8 character-level perceptrons.

    Layer layout (stored in ``self.linear_stack``):

        Embedding -> Flatten
        -> Linear(n_embd * block_size, n_hidden) -> norm -> activation
        -> n_inner_layers x [Linear(n_hidden, n_hidden) -> norm -> activation]
        -> Linear(n_hidden, vocab_size) -> norm

    All linear layers are created with ``bias=False``. Subclasses only choose
    ``n_inner_layers``.

    Args:
        vocab_size: number of distinct tokens (embedding rows and output width).
        n_embd: embedding dimension per token.
        n_hidden: width of every hidden layer.
        norm_layer: callable that builds a normalisation layer from a feature
            count (e.g. ``nn.BatchNorm1d``).
        activation: activation module instance; the same instance is placed
            after every hidden layer.

    Note:
        The width of the first linear layer uses the module-level
        ``block_size`` (context length), which must be defined before a model
        is instantiated.
    """

    # Number of hidden-to-hidden Linear/norm/activation groups.
    n_inner_layers = 0

    def __init__(self, vocab_size, n_embd, n_hidden, norm_layer, activation):
        super().__init__()
        # Modules are created in the order they appear in the network so that
        # parameter initialisation consumes the RNG in a fixed sequence.
        layers = [
            nn.Embedding(vocab_size, n_embd), nn.Flatten(),
            nn.Linear(n_embd * block_size, n_hidden, bias=False), norm_layer(n_hidden), activation,
        ]
        for _ in range(self.n_inner_layers):
            layers += [nn.Linear(n_hidden, n_hidden, bias=False), norm_layer(n_hidden), activation]
        layers += [nn.Linear(n_hidden, vocab_size, bias=False), norm_layer(vocab_size)]
        self.linear_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the layer stack (logits) for input ``x``."""
        return self.linear_stack(x)


class MLP0(_StackedMLP):
    """Perceptron with 9 hidden-to-hidden layers (deepest variant)."""
    n_inner_layers = 9


class MLP1(_StackedMLP):
    """Perceptron with 8 hidden-to-hidden layers."""
    n_inner_layers = 8


class MLP2(_StackedMLP):
    """Perceptron with 7 hidden-to-hidden layers."""
    n_inner_layers = 7


class MLP3(_StackedMLP):
    """Perceptron with 6 hidden-to-hidden layers."""
    n_inner_layers = 6


class MLP4(_StackedMLP):
    """Perceptron with 5 hidden-to-hidden layers."""
    n_inner_layers = 5


class MLP5(_StackedMLP):
    """Perceptron with 4 hidden-to-hidden layers."""
    n_inner_layers = 4


class MLP6(_StackedMLP):
    """Perceptron with 3 hidden-to-hidden layers."""
    n_inner_layers = 3


class MLP7(_StackedMLP):
    """Perceptron with 2 hidden-to-hidden layers."""
    n_inner_layers = 2


class MLP8(_StackedMLP):
    """Perceptron with 1 hidden-to-hidden layer (shallowest variant)."""
    n_inner_layers = 1


# Candidate architectures, ordered from deepest to shallowest.
inputs = [MLP0, MLP1, MLP2, MLP3, MLP4, MLP5, MLP6, MLP7, MLP8]

In [7]:
# Lookup tables that translate the integer-coded genes into PyTorch objects.

# Optimizer classes (instantiated later with the model parameters).
optim_map = dict(enumerate((
    torch.optim.Adam,
    torch.optim.SGD,
)))

# Activation module instances.
act_map = dict(enumerate((
    torch.nn.ReLU(),
    torch.nn.Tanh(),
    torch.nn.LeakyReLU(),
    torch.nn.Sigmoid(),
)))

# Normalisation layer classes (instantiated later with a feature count).
norm_map = dict(enumerate((
    torch.nn.BatchNorm1d,
    torch.nn.LayerNorm,
)))


In [8]:
def fitness_func_factory(inp_model):
    """Build a PyGAD fitness function bound to one model architecture.

    Args:
        inp_model: model class, called as
            ``inp_model(vocab_size, n_embd, n_hidden, norm_layer, activation)``.

    Returns:
        A function ``fitness_func(solution, sol_idx)`` that trains a fresh
        model with the hyperparameters encoded in ``solution`` and returns
        the inverse of its dev-set loss (higher is better).
    """
    def fitness_func(solution, sol_idx):
        """Train one model for a fixed number of steps and score it.

        Args:
            solution: sequence of 7 genes, in order: n_embd, n_hidden,
                norm_layer_num, activation_num, lr, batch_size, optimizer_num.
            sol_idx: index of the solution in the population (unused).

        Returns:
            ``1 / dev_loss`` as a float, or ``0.0`` when training diverged.
        """
        n_embd, n_hidden, norm_layer_num, activation_num, lr, batch_size, optimizer_num = solution
        # Genes may arrive as floats; cast the categorical ones explicitly
        # before using them as lookup keys.
        activation = act_map[int(activation_num)]
        norm_layer = norm_map[int(norm_layer_num)]
        batch_size = int(batch_size)  # cast once instead of on every step

        model = inp_model(vocab_size, int(n_embd), int(n_hidden), norm_layer, activation).to(device)

        optimizer = optim_map[int(optimizer_num)](params=model.parameters(), lr=lr)

        loss_function = torch.nn.CrossEntropyLoss()
        # Fixed seed: every candidate sees the same sequence of minibatches.
        g = torch.Generator().manual_seed(2147483647)
        steps = 750
        for _ in range(steps):
            # Sample a random minibatch of row indices from the training set.
            inx = torch.randint(0, training_set.shape[0], (batch_size,), generator=g)

            optimizer.zero_grad()

            # forward pass
            preds = model(training_set[inx])
            loss = loss_function(preds, y_training_set[inx])

            # backward pass
            loss.backward()

            # update the weights
            optimizer.step()

        dev_loss = get_loss(model, 'dev')
        # A diverged run yields a NaN/inf loss. Returning that as fitness
        # would corrupt fitness-proportional parent selection, so score such
        # candidates as the worst possible instead.
        if not math.isfinite(dev_loss):
            return 0.0
        # The small epsilon guards against division by zero.
        return 1.0 / (dev_loss + 1e-8)

    return fitness_func

In [9]:
# --- Genetic-algorithm settings (passed to pygad.GA below) ---
num_generations = 2               # number of generations
num_parents_mating = 5            # solutions selected as parents in the mating pool
sol_per_pop = 10                  # population size
mutation_num_genes = 1            # genes mutated per offspring
keep_elitism = 2                  # value forwarded to pygad.GA's keep_elitism
parent_selection_type = "rws"     # type of parent selection
crossover_type = "single_point"

# Allowed values per gene, in the order the fitness function unpacks them.
gene_space = [
    [8, 16, 32, 64],                    # n_embd
    dict(low=16, high=100, step=1),     # n_hidden
    [0, 1],                             # norm_layer_num (key into norm_map)
    [0, 1, 2],                          # activation_num (key into act_map)
    np.linspace(0.0001, 0.1, num=10),   # lr
    [16, 32, 64],                       # batch_size
    [0, 1],                             # optimizer_num (key into optim_map)
]
num_genes = len(gene_space)

# Run an independent hyperparameter search for every candidate architecture.
for curr_model in inputs:

    ga_instance = pygad.GA(num_generations=num_generations,
                        num_parents_mating=num_parents_mating,
                        num_genes=num_genes,
                        sol_per_pop=sol_per_pop,
                        keep_elitism=keep_elitism,
                        parent_selection_type=parent_selection_type,
                        fitness_func=fitness_func_factory(curr_model),
                        crossover_type=crossover_type,
                        gene_space=gene_space,
                        mutation_num_genes=mutation_num_genes,
                        on_generation=callback_generation,
                        )

    # Start the genetic algorithm evolution.
    ga_instance.run()

    ga_instance.plot_fitness(title="PyGAD & PyTorch - Iteration vs. Fitness", linewidth=4)

    # Reuse the fitness values computed during run(). Calling best_solution()
    # without pop_fitness would evaluate the fitness function for the whole
    # population again, i.e. retrain every network, and since model
    # initialisation is not seeded it could even rank the solutions
    # differently from what the GA saw.
    solution = ga_instance.best_solution(pop_fitness=ga_instance.last_generation_fitness)[0]
    n_embd, n_hidden, norm_layer_num, activation_num, lr, batch_size, optimizer_num = solution
    # Resolved objects for the best solution; not used by the print below,
    # which looks them up again.
    activation = act_map[activation_num]
    norm_layer = norm_map[norm_layer_num]
    print(f"Best solution: \
        n_embd = {n_embd},\
        n_hidden_1 = {n_hidden},\
        norm_layer = {norm_map[norm_layer_num]},\
        activation = {act_map[activation_num]},\
        lr = {lr},\
        batch_size = {batch_size},\
        optimizer = {optim_map[optimizer_num]},\
    ")


AttributeError: module 'numpy' has no attribute 'float'.
`np.float` was a deprecated alias for the builtin `float`. To avoid this error in existing code, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations