In [None]:
import pandas as pd
import numpy as np
from numpy.random import default_rng
import random
from itertools import chain
import pickle

import choices
import create_gene
import calculate_fitness
import select_mating_pool
import crossover

from tqdm import tqdm

# Shared NumPy random generator used by the cells below.
# NOTE(review): no seed is passed, so results presumably differ between
# runs — consider default_rng(<fixed seed>) for reproducibility.
rng = default_rng()

In [None]:
# Experiment configuration: population size, the option tables of every
# block, and the scoring weights handed to the fitness function.

n_genes = 500  # number of genes in the initial population

# Every table maps an option name to an integer.
# NOTE(review): presumably the number of places available per option —
# confirm against create_gene.get_gene.
companies = dict.fromkeys(
    [
        'Zeix',
        'SCS Supercomputing Systems',
        'Kyburz',
        'Siemens Mobility AG',
        'Varian',
        'MAN Energy Solutions',
        'Universitätsspital Zürich',
        'Acht Grad Ost',
        'Weidmann',
        'True Wealth',
        'PartnerRe',
        'Sensirion',
        'Hocoma',
    ],
    20,
)

uni1 = {
    'Architektur (ETH)': 30,
    'Biologie (ETH)': 60,
    'Raumbezogene Ingenieurwissenschaften (ETH)': 16,
    'Umweltingenieurwissenschaften (ETH)': 16,
    'Autonome Agile Drohnen (UZH)': 40,
    'Biodiversität in Zeit und Raum (UZH)': 24,
    'Einblick in aktuelle Forschung am Physik-Institut (UZH)': 20,
}

uni2 = {
    'Chemie (ETH)': 25,
    'Materialwissenschaft (ETH)': 20,
    'Medizin studieren an der ETH': 50,
    'Pharmazeutische Wissenschaften (ETH)': 24,
    'Physik (ETH)': 20,
    'Illuminating the Chemistry of Life (UZH)': 30,
    'Was ist ein Programm? (UZH)': 20,
}

uni3 = {
    'Aktuelle Forschung am CERN (UZH)': 30,
    'Hyperwürfel (UZH)': 20,
    'Krebsforschung (UZH)': 20,
    'Informatik (ETH)': 16,
    'Informationstechnologie und Elektrotechnik (ETH)': 30,
    'Maschineningenieurwissenschaften (ETH)': 40,
    'Mathematik (ETH)': 30,
}

workshops = {
    'Oberflächen und Farben': 20,
    'Fourier-Reihen': 20,
    'Datenexploration': 20,
    'Astrophysik, Planetologie und Raumfahrt': 20,
    'Molekulare Schalter – eine on/off-Beziehung': 12,
    "Let's Arduino": 12,
    'Die Hoffnung ist erneuerbar': 20,
    "Swiss Young Physicists' Tournament": 20,
    'Von Big Data und Künstlicher Intelligenz zur Umweltinformatik': 16,
    'Woher kommst du wirklich? Frag deine DNA!': 20,
    'Astronomie': 20,
    'Entsorgung, Recycling und Abwasserreinigung in Zürich': 16,
    'Praktische Molekulargenetik: Wolbachia – auf der Suche nach Bakterien-DNA in selbstgefangenen Insekt': 14,
    'Energiespeicherung – Vom Akku bis zum Pump-Speicherstausee': 20,
}

# Block name -> option table; insertion order fixes the order of block_names.
blocks = dict(
    companies=companies,
    uni1=uni1,
    uni2=uni2,
    uni3=uni3,
    workshops=workshops,
)
block_names = list(blocks)

# Passed to calculate_fitness.calc_fitness.
# NOTE(review): presumably one weight per priority level (first, second,
# third) — confirm against calc_fitness.
multipliers = [50, 2, 1]
counter = [0] * 3

# Load the priorities table from disk; its row count is used as the number
# of people everywhere below.
priorities_df = pd.read_json('priorities.json')
n_people = priorities_df.shape[0]

# Build the choices table consumed by the fitness function and the report.
choices_df = choices.get_choices(blocks, priorities_df)

# Build the initial population: n_genes genes, each scored once.
print('Generating initial population:')
genes_init = {}

for i in tqdm(range(n_genes)):
    # Zero-padded key such as 'GID00037'. Named gene_id (not `id`) so the
    # builtin id() is not shadowed for the rest of the notebook session.
    gene_id = f'GID{i:05}'

    gene_df = create_gene.get_gene(blocks, n_people)
    fitness = calculate_fitness.calc_fitness(choices_df, gene_df, blocks, block_names, multipliers)

    # Entry layout is [gene, fitness]; later code reads [0] for the gene
    # and [1] for its score.
    genes_init[gene_id] = [gene_df, fitness]

# Mean fitness over the ten genes returned by get_best.
top_ten = select_mating_pool.get_best(genes_init, 10)
print(np.mean([genes_init[g][1] for g in top_ten]))


In [None]:

# Parameter sweep: for every population size and iteration budget, evolve
# the population and report, per block, how many people match a first,
# second or third choice in the best gene.

n_iterations_list = [10]
n_genes_list = [100]

# Fractions handed to select_mating_pool.get_mating_pool.
frac_elite = 0.1
frac_lucky = 0.01

for n_genes in n_genes_list:

    # First n_genes entries of the initial population, in insertion order.
    genes_start = dict(list(genes_init.items())[:n_genes])

    # get_best(..., 1) is asked for a single gene, so the mean below is
    # just that gene's fitness.
    best_ids = select_mating_pool.get_best(genes_start, 1)
    initial_score = np.mean([genes_start[g][1] for g in best_ids])
    print(f'{n_genes} genes, initial score = {initial_score}\n')

    print('Optimising population:\n')

    for n_iterations in n_iterations_list:

        # Restart every budget from the same starting population. Without
        # this reset a second entry in n_iterations_list would continue
        # from the already-evolved genes and the "{n_iterations}
        # iterations" label below would under-report the work done.
        # NOTE(review): this is a shallow copy; it assumes get_offspring
        # does not modify the gene frames in place — confirm.
        genes = dict(genes_start)

        for i in tqdm(range(n_iterations)):
            pool = select_mating_pool.get_mating_pool(genes, frac_elite, frac_lucky)

            offsprings = crossover.get_offspring(pool, choices_df, blocks, n_genes, multipliers)

            # The offspring replace the whole population.
            genes = offsprings

        best_ids = select_mating_pool.get_best(genes, 1)
        score = np.mean([genes[g][1] for g in best_ids])
        print(f'{n_genes} genes, {n_iterations} iterations, score = {score} ({score/initial_score:.2f}x)')

        best = genes[best_ids[0]][0]

        for block in blocks:
            print(f'\n{block:<12}:', end=' ')

            for level in ['first', 'second', 'third']:

                # Columns of choices_df whose name starts with
                # '<block>_<level>'.
                filter_col = [col for col in choices_df if col.startswith(f'{block}_{level}')]
                # Rows with at least one matching column.
                # NOTE(review): best[block] is presumably a pandas Series;
                # DataFrame.isin then matches on index, i.e. each row is
                # compared with the entry for the same index label — confirm.
                number = choices_df[filter_col].isin(best[block]).any(axis=1).sum()
                print(f'{level} = {number}', end=' ')

        # Terminate the last report line so later output starts on a new line.
        print()

In [None]:
# NOTE(review): `counts` is first assigned in the cell after this one, so on
# a fresh kernel (Restart & Run All) this line depends on out-of-order
# execution. `block` is whatever an earlier cell left behind.
counts[block]

In [None]:
# For one block, flag every option that occurs in gene_df more often than
# the maximum listed for it in `blocks`.
block = 'companies'

# Maximum per option, as a one-column frame sorted by option name.
max_counts = pd.Series(blocks[block], name=block).sort_index().to_frame()

# Occurrences per option in gene_df, aligned to the same index.
#  * reindex(..., fill_value=0): options that never occur are absent from
#    value_counts(); without them the comparison below raises
#    "Can only compare identically-labeled Series objects".
#  * rename(block): from pandas 2.0 the value_counts() result is named
#    'count', so the column has to be named explicitly for counts[block].
counts = (
    gene_df[block]
    .value_counts()
    .reindex(max_counts.index, fill_value=0)
    .rename(block)
    .to_frame()
)

# True where an option is over its maximum.
counts[block] > max_counts[block]

In [None]:
# Scratch check of a mutation step: replace a small fraction of one gene's
# entries in a single block with randomly drawn options and re-score it.
block = 'companies'
rate = 0.02
options = list(blocks[block])

# Work on a copy so the gene stored in `pool` is not modified and re-running
# this cell always starts from the same data.
# NOTE(review): 'GID00037' is hard-coded and raises KeyError if that gene is
# not part of the current pool.
off_fd = pool['GID00037'][0].copy()

# One uniform draw per person; entries whose draw is below `rate` mutate.
random_value = rng.random(size=n_people)
# One replacement per person. A single random.choice(options) call would be
# evaluated once and broadcast, giving every mutated person the same option.
replacements = rng.choice(options, size=n_people)
off_fd[block] = np.where(random_value < rate, replacements, off_fd[block])

fitness = calculate_fitness.calc_fitness(choices_df, off_fd, blocks, block_names, multipliers)
fitness

In [None]:
# Display the 'companies' column of off_fd after the mutation in the cell above.
off_fd['companies']