# Genetic for genomics

In this notebook, we aim to find the best set of hyperparameters for our different models.

We will start with a single type of model to keep things simple.


In [1]:
import sys
import importlib
import pickle
import random

# data manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# data analysis
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# pytorch specific
import torch
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

# project specific
sys.path.append('../')
from src import config
from src.utils import visualisation, benchmark, helpers, monitoring, experiment
from src.models import model


from matplotlib.animation import FuncAnimation
from IPython.display import HTML

# Widen the pandas text display so wide frames are not wrapped.
pd.set_option("display.width", 1000)

# Torch device named by the "DEVICE" entry of the project config.
DEVICE = torch.device(config["DEVICE"])

In [2]:
# Settings that stay fixed for every candidate configuration.

# Option A: describe the dataset through explicit preprocessing settings.
data_params = dict(
    LS_threshold=0.0023,
    MAD_threshold=1,
    MT_removal=True,
    expression_threshold=0.1,
)

# Option B (the one actually in effect, since it overrides option A):
# point at a pre-built dataset pickle to save time.
data_params = '../workfiles/light_BRCA_ds.pkl'

# Model flags shared by every configuration.
model_params = dict(variational=False, convolution=False, transformer=False)

# Search space: each key maps to the list of values it may take.
dynamic_params = dict(
    dropout=[0.1, 0.3, 0.5],
    latent_dim=[16, 32, 64, 128],
    # ... (any other uncoupled parameters)
    # Coupled parameters are given as tuples of (name, value) pairs so that they
    # always change together. Here kernel_size == 2 * padding + 1, which yields
    # (3, 7), (2, 5) and (1, 3).
    # Note: the key "conv_params" is only a placeholder and won't appear in the
    # final configs.
    conv_params=[
        (("padding", pad), ("kernel_size", 2 * pad + 1))
        for pad in (3, 2, 1)
    ],
)

# Build every candidate configuration and report how many there are.
all_config = helpers.generate_config(model_params, dynamic_params)
print(len(all_config))

36


In [3]:
# Hyper param
EPOCH = 3000  # passed as n_epoch to every experiment run

# Seed Python's `random` module so the sampling, crossover and mutation draws
# are repeatable after a kernel restart.
# NOTE(review): this does not seed torch/numpy; seed those too if the
# experiments themselves must be reproducible.
SEED = 42
random.seed(SEED)

# Initialize GA parameters
population_size = 100  # number of children bred at each generation
# A textbook mutation rate is small (~0.01), but every run is so costly that
# we have to tune it up.
mutation_rate = 0.5
crossover_rate = 0.7  # This is usually high -> fine by me.
generations = 5
# Size of the very first population: kept small because each individual has
# to be evaluated with a full training run.
initial_population_size = 8

# Step 1: Create the initial population
# random.sample raises ValueError when asked for more items than available,
# so never request more than the number of generated configurations.
population = random.sample(
    all_config, min(initial_population_size, len(all_config))
)

In [4]:
# Fitness values already computed, keyed by (data_params, EPOCH, configuration).
# Re-running this cell resets the cache.
_FITNESS_CACHE = {}


def calculate_fitness(individual):
    """Return the fitness of one configuration, running the experiment at most once.

    Builds an ``experiment.Experiment`` from the notebook-level ``data_params``
    and ``EPOCH`` with ``individual`` as the model parameters, calls ``run()``
    and returns the experiment's ``metric`` attribute.

    Results are memoised: populations bred from two parents contain many
    identical configurations, and each evaluation is a complete run, so a
    configuration that was already evaluated is served from the cache.

    Args:
        individual: dict of model parameters describing one configuration.

    Returns:
        The ``metric`` attribute of the finished experiment.
        NOTE(review): select_parents treats larger values as better; confirm
        against experiment.Experiment.
    """
    try:
        # dicts are unhashable, so use a sorted tuple of their items; EPOCH and
        # data_params are part of the key so changing them invalidates old entries.
        cache_key = (repr(data_params), EPOCH, tuple(sorted(individual.items())))
        hash(cache_key)
    except TypeError:
        # Unhashable or unsortable content: fall back to an uncached run.
        cache_key = None

    if cache_key is not None and cache_key in _FITNESS_CACHE:
        return _FITNESS_CACHE[cache_key]

    e = experiment.Experiment(data_param=data_params, model_param=individual, n_epoch=EPOCH)
    e.run()

    if cache_key is not None:
        _FITNESS_CACHE[cache_key] = e.metric
    return e.metric

def select_parents(population, fitness_fn=None):
    """Pick the two fittest individuals, preferring two different ones.

    The population is ranked by decreasing fitness (higher is assumed to be
    better). The first parent is the fittest individual; the second parent is
    the best-ranked individual that is not equal to the first. When every
    individual is identical (or there is only one), both parents are the same.

    Args:
        population: non-empty list of individuals (configuration dicts).
        fitness_fn: optional callable mapping an individual to its fitness.
            Defaults to the notebook-level ``calculate_fitness``.

    Returns:
        A ``(parent1, parent2)`` tuple.

    Raises:
        ValueError: if ``population`` is empty.
    """
    if not population:
        raise ValueError("cannot select parents from an empty population")

    if fitness_fn is None:
        fitness_fn = calculate_fitness

    # sorted() calls the key function exactly once per individual.
    ranked = sorted(population, key=fitness_fn, reverse=True)

    parent1 = ranked[0]
    # Scan the whole ranking for a distinct mate instead of looking only at
    # positions 1 and 2: offspring populations are full of duplicates, and a
    # small population would otherwise raise IndexError.
    parent2 = next(
        (candidate for candidate in ranked[1:] if candidate != parent1),
        parent1,
    )

    return parent1, parent2

def crossover(parent1, parent2):
    """Uniform crossover: the most basic recombination possible.

    For every key of ``parent1`` (in its iteration order) a fair coin decides
    whether the child inherits the value from ``parent1`` or from ``parent2``.

    Returns:
        A new dict with the same keys as ``parent1``.
    """
    return {
        gene: (parent1[gene] if random.choice([True, False]) else parent2[gene])
        for gene in parent1
    }

def _default_dynamic_params():
    """Return the notebook-level ``dynamic_params`` as it is at call time."""
    return dynamic_params


def mutate(individual, dynamic_params=None):
    """Return a copy of ``individual`` with one randomly chosen parameter changed.

    One key of the search space is drawn at random:

    * If the key is a regular parameter of ``individual``, it is replaced by a
      different value from that key's list of options.
    * Otherwise the key is treated as a placeholder for coupled parameters
      (such as ``"conv_params"``), whose options are tuples of ``(name, value)``
      pairs. A different option is drawn and all its pairs are applied together,
      so coupled values never get out of sync.

    If no alternative exists, ``individual`` itself is returned unchanged.
    The input dict is never modified in place.

    Args:
        individual: configuration dict to mutate.
        dynamic_params: search space mapping each key to its list of options.
            Defaults to the notebook-level ``dynamic_params``, looked up at call
            time so that re-running the configuration cell is picked up.

    Returns:
        The mutated copy, or ``individual`` when nothing can be changed.

    Raises:
        KeyError: if the drawn key is neither a parameter of ``individual`` nor
            a group of ``(name, value)`` pairs whose names are all present.
    """
    if dynamic_params is None:
        dynamic_params = _default_dynamic_params()
    if not dynamic_params:
        return individual

    # Choose a parameter (or group of coupled parameters) to mutate
    mutation_param = random.choice(list(dynamic_params.keys()))
    options = dynamic_params[mutation_param]

    if mutation_param in individual:
        # Plain parameter: every other value in the list is a candidate.
        candidate_updates = [
            {mutation_param: value}
            for value in options
            if value != individual[mutation_param]
        ]
    else:
        # Placeholder key: each option is a tuple of (name, value) pairs.
        candidate_updates = []
        for option in options:
            try:
                update = dict(option)
            except (TypeError, ValueError):
                # Not a group of pairs: the key is genuinely missing.
                raise KeyError(mutation_param) from None
            # Keep only the options that differ from the current values.
            if any(individual[name] != value for name, value in update.items()):
                candidate_updates.append(update)

    # If there are no available choices (e.g. the list had only one element),
    # no mutation happens
    if not candidate_updates:
        return individual

    # Apply the mutation on a copy
    mutated_individual = individual.copy()
    mutated_individual.update(random.choice(candidate_updates))

    return mutated_individual

In [6]:
# Pick up any edits made to the experiment module since it was imported.
importlib.reload(experiment)

# Main GA loop
for generation in range(generations):
    print(f"Running generation {generation + 1}...")

    # Selection: rank the current population once per generation.
    # The population does not change while children are being bred, so the
    # parents are the same for every child; evaluating fitness inside the
    # breeding loop would re-run every experiment once per child.
    parent1, parent2 = select_parents(population)

    new_population = []
    while len(new_population) < population_size:
        # Crossover
        if random.random() < crossover_rate:
            child = crossover(parent1, parent2)
        else:
            # If no crossover, just select one of the parents at random for next generation
            child = random.choice([parent1, parent2])

        # Mutation
        if random.random() < mutation_rate:
            child = mutate(child)

        new_population.append(child)

    # Here, you may want to mix the new population with the old one and keep the best for next generation
    # Or completely replace it, depending on your strategy.

    population = new_population

# Your population now consists of the evolved solutions.
# You can select the best one as your final choice.
best_solution = max(population, key=calculate_fitness)


Running generation 1...
{'variational': False, 'convolution': False, 'transformer': False, 'dropout': 0.5, 'latent_dim': 16, 'padding': 3, 'kernel_size': 7}
input shape : 4819
{'PPMI_DATA_PATH': '/Users/aygalic/Thesis/data/quant/', 'PPMI_METADATA_PATH': '/Users/aygalic/Thesis/METADATA_200123.xlsx', 'CANCER_DATA_PATH': '/Users/aygalic/Thesis/data/cancer', 'BRCA_DATA_PATH': 'C:/thesis/data/BRCA/', 'BRCA_METADATA_PATH': 'C:/thesis/data/BRCA/metadata.cart.2023-09-22.json', 'BRCA_SUBTYPES_PATH': 'C:/thesis/data/BRCA/patient_subtype.tsv', 'DEVICE': 'cuda'}


KeyError: 'transformer'