# Genetic Algorithms based Knowledge Distillation

This notebook introduces a knowledge distillation application for a pretrained model built on the Transformer architecture. It uses a genetic algorithm to select the best-fitting pretrained layers from the base (teacher) model.

In [None]:
# import neural network libraries and utils
import torch
from torch.utils.data import DataLoader
from sentence_transformers import models, losses, evaluation
from sentence_transformers import (
    LoggingHandler,
    SentenceTransformer,
    util,
    InputExample,
)
from sentence_transformers.datasets import ParallelSentencesDataset
# import general utils and helper libraries
import random
import typing as t
from datetime import datetime
# import genetics algorithm modules and utils
import src.genetics.utils as ga_utils
from src.genetics.population import Population
from src.utils import (
    download_dataset,
    read_as_dataframe,
    ALLNLI_DATASET_URL,
    STS_BENCHMARK_DATASET_URL,
)

## Set parameters

You can set all parameters related to this project here, including the genetic algorithm hyperparameters.

In [None]:
# set utility variables
random_state = random.Random(42) # seeded RNG so that repeated runs of this notebook use the same random sequence
ga_utils.random_state = random_state # share the same RNG instance with the genetics module
# set global variables
model_name = "all-MiniLM-L12-v2" # model to be distilled (i.e. the teacher model); also the starting point of every student
output_path = f"output/{model_name}_" + datetime.now().strftime(r"%Y-%m-%d_%H-%M-%S") # timestamped output path, passed to fit() as output_path
max_train_samples = 1_000 # maximum number of training sentences kept after shuffling
train_batch_size = 32 # batch size of the training DataLoader
inference_batch_size = 32 # batch size passed to ParallelSentencesDataset
max_sentence_length = 256 # maximum char length for each sample (sentence) in the training set
### standard neural network hyperparameters ###
# All five values below are forwarded to SentenceTransformer.fit() in fitness_function.
# NOTE(review): with max_train_samples=1_000 and train_batch_size=32 an epoch has at
# most ~32 batches, so warmup_steps=1000 and evaluation_steps=5000 are both larger
# than the whole training run — confirm this is intended.
epochs = 1 
warmup_steps = 1000
evaluation_steps = 5000
learning_rate = 1e-4
epsilon = 1e-6
### standard neural network hyperparameters ###
# set hyperparameters for genetic algorithms
max_generation = 10 # maximum number of generations (i.e. iterations of the eval/update loop)
population_size = 10 # population size (i.e. number of chromosomes in each generation)
mutation_rate = 0.01 # mutation rate; NOTE(review): unclear whether this is a fraction (1%) or a percentage (0.01%) — confirm against Population
chromosome_length = 10 # number of genes per chromosome; each gene is a teacher layer index, and repeated indices select the same layer only once
gene_value_range = (0, 12)  # value range for a gene => [a, b) means a is included while b is excluded; presumably matches the teacher's 12 encoder layers — verify

## Load teacher model

The teacher model is the base model to be distilled. The genetic algorithm selects a subset of its encoder layers in order to train the best distilled (student) model.

In [None]:
# Load the pretrained teacher. It provides the target embeddings during
# distillation and the reference score on the benchmark.
teacher_model = SentenceTransformer(model_name)

## Data preparation

We will download the training and evaluation (i.e. benchmark) datasets separately and convert them to DataFrames.
We are using these datasets:
- For training: [**ALLNLI**](https://www.sbert.net/examples/datasets/README.html): Includes the [SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) datasets
- For benchmark: [**STS Benchmark**](http://ixa2.si.ehu.eus/stswiki/index.php/Main_Page): STS Benchmark comprises a selection of the English datasets used in the STS tasks organized in the context of SemEval between 2012 and 2017.

In [None]:
def download_as_dataframe(url: str, download_path: str):
    """Download a dataset file and load it as a DataFrame.

    Args:
        url (str): Location of the dataset to download.
        download_path (str): Local path the file is stored at and read from.

    Returns:
        Whatever `read_as_dataframe` produces for the downloaded file.
    """
    # Fetch first; the reader works on the local copy.
    download_dataset(url, download_path)
    frame = read_as_dataframe(download_path)
    return frame


# Training data: ALLNLI.
training_ds = download_as_dataframe(ALLNLI_DATASET_URL, "datasets/allnli.tsv.gz")
# Evaluation (benchmark) data: STS Benchmark.
benchmark_ds = download_as_dataframe(
    STS_BENCHMARK_DATASET_URL, "datasets/stsbenchmark.tsv.gz"
)

## Create train and benchmark evaluators

Evaluators are objects that can be passed to a model's `fit` function (here, the student model's). Each evaluator bundles a dataset with the evaluation function applied to it.

In [None]:
# --- training evaluator ---
# Gather the sentences of both columns of the "train" split and de-duplicate them.
train_sents = training_ds[training_ds["split"] == "train"].loc[
    :, ["sentence1", "sentence2"]
]
# dict.fromkeys removes duplicates while keeping first-seen order. A plain
# set() of strings iterates in an order that changes between interpreter runs
# (hash randomization), which would make the seeded shuffle below pick a
# different subset on every run.
X_train = list(
    dict.fromkeys(
        train_sents["sentence1"].to_list() + train_sents["sentence2"].to_list()
    )
)
random_state.shuffle(X_train)
X_train = X_train[:max_train_samples]  # limit train dataset
# The same sentences are used as source and target; the teacher supplies the
# reference embeddings.
train_eval = evaluation.MSEEvaluator(
    X_train,
    X_train,
    teacher_model,
    name="allnli-train",
)

# --- benchmark evaluator ---
# Sentence pairs and gold scores of the "dev" split.
bench_sents = benchmark_ds[benchmark_ds["split"] == "dev"].loc[
    :, ["sentence1", "sentence2", "score"]
]
bench_samples = [
    # Gold score is divided by 5.0 (presumably mapping a 0-5 scale to 0-1 — verify).
    InputExample(texts=[first, second], label=float(score) / 5.0)
    for first, second, score in zip(
        bench_sents["sentence1"],
        bench_sents["sentence2"],
        bench_sents["score"],
    )
]
bench_eval = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
    bench_samples,
    name="sts-dev",
)

## Evaluate teacher model

We first evaluate the teacher model on the benchmark dataset.

In [None]:
# Score the teacher on the benchmark; fitness_function divides each student's
# benchmark score by this value.
# NOTE(review): assumes the evaluator returns a single non-zero number — confirm
# for the installed sentence-transformers version.
teacher_eval_result = bench_eval(teacher_model)
print("Teacher model's benchmark result:", teacher_eval_result)

## Define fitness function

We define the fitness function that is passed to the genetic algorithm.

In [None]:
def get_student_model_from_layers(layers: t.List[int]) -> SentenceTransformer:
    """Create a student model from the teacher checkpoint, keeping only given layers.

    A fresh copy of `model_name` is loaded and its encoder is reduced to the
    layers whose index appears in `layers`. The kept layers stay in their
    original order. An index listed more than once is kept only once, and an
    index that does not exist in the encoder is ignored.

    Args:
        layers (t.List[int]): List of encoder layer indices to keep.

    Returns:
        SentenceTransformer: Student model with the reduced encoder.
    """
    student_model = SentenceTransformer(model_name)
    auto_model = student_model._first_module().auto_model
    new_layers = torch.nn.ModuleList(
        [
            layer_module
            for i, layer_module in enumerate(auto_model.encoder.layer)
            if i in layers
        ]
    )
    auto_model.encoder.layer = new_layers
    # Record the number of layers actually kept. `len(layers)` would be wrong
    # whenever `layers` contains duplicate or out-of-range indices.
    auto_model.config.num_hidden_layers = len(new_layers)
    return student_model


def fitness_function(layers: t.List[int]) -> float:
    """Train a student built from `layers` and score it against the teacher.

    The student is distilled from the teacher on `X_train` with an MSE loss
    and then evaluated on the benchmark.

    Args:
        layers (t.List[int]): List of layer indices.

    Returns:
        float: Student's benchmark result divided by the teacher's result.
    """
    student = get_student_model_from_layers(layers)

    # Distillation dataset: the teacher's embeddings are the training targets.
    distill_data = ParallelSentencesDataset(
        student_model=student,
        teacher_model=teacher_model,
        batch_size=inference_batch_size,
        use_embedding_cache=False,
    )
    # add_dataset is given one single-sentence row per training sentence.
    sentence_rows = [[sentence] for sentence in X_train]
    distill_data.add_dataset(
        sentence_rows,
        max_sentence_length=max_sentence_length,
    )

    objective = (
        DataLoader(distill_data, shuffle=True, batch_size=train_batch_size),
        losses.MSELoss(model=student),
    )
    combined_evaluator = evaluation.SequentialEvaluator([bench_eval, train_eval])

    student.fit(
        train_objectives=[objective],
        evaluator=combined_evaluator,
        epochs=epochs,
        warmup_steps=warmup_steps,
        evaluation_steps=evaluation_steps,
        output_path=output_path,
        optimizer_params={"lr": learning_rate, "eps": epsilon},
        save_best_model=False,
        use_amp=True,
    )

    # Fitness is relative to the teacher's benchmark score.
    student_score = bench_eval(student)
    return student_score / teacher_eval_result

## Run genetic algorithms

Finally, we run the genetic algorithm to find the best-suited selection of teacher layers for the student model.

In [None]:
# Build the initial population from the genetic hyperparameters set earlier.
# NOTE(review): the meaning of each argument is defined by
# src.genetics.population.Population, which is not visible here.
population = Population(
    population_size,
    mutation_rate,
    value_range=gene_value_range,
    length=chromosome_length,
    keep_best_chromosomes=True,
)

# One iteration per generation: evaluate with the fitness function, update the
# population, then report that generation's best chromosome and its fitness.
# Every fitness evaluation trains a student model, so this loop is expensive.
for i in range(max_generation):
    population.eval(fitness_function)
    population.update()
    print(population.local_best, population.local_best.fitness)
# Report the overall best chromosome (presumably the best across all generations — verify).
print(population.global_best, population.global_best.fitness)