In [138]:
import operator
import gym
import random
# from NN import NN 
from itertools import permutations
import numpy as np

In [139]:
# Shared CartPole-v1 environment; the cells below reuse it for training, evaluation and rendering.
env = gym.make('CartPole-v1')

In [140]:
class NN:
    """Fully connected feed-forward network whose weights are evolved, not trained.

    ``self.W`` is a list of 2-D weight matrices, one per layer. Every matrix
    has one extra input row that serves as the bias: a constant ``1`` is
    appended to each layer's input before the matrix product.
    """

    def __init__(self, w):
        """Wrap an existing list of weight matrices (stored by reference)."""
        self.W = w

    @classmethod
    def from_params(cls, input_nodes, hidden_layers, output_nodes):  # NN.from_params(3,[3,4,5],1)
        """Build a network with random weights drawn from N(0, 0.1).

        Args:
            input_nodes: Number of input features.
            hidden_layers: Sequence of hidden-layer widths (may be empty).
            output_nodes: Number of output units.

        Returns:
            A new instance whose layer ``k`` has shape
            ``(size[k] + 1, size[k + 1])``; the ``+ 1`` row is the bias.
        """
        sizes = [input_nodes] + list(hidden_layers) + [output_nodes]
        weights = [
            np.random.normal(0, 0.1, (fan_in + 1, fan_out))
            for fan_in, fan_out in zip(sizes[:-1], sizes[1:])
        ]
        return cls(weights)

    @classmethod
    def from_weights(cls, w):
        """Build a network directly from a list of weight matrices."""
        return cls(w)

    @classmethod
    def crossover(cls, dna_1, dna_2):
        """Uniform crossover of two weight lists.

        Each weight of the child is taken from ``dna_1`` when a fresh
        ``random.random()`` draw exceeds 0.5, otherwise from ``dna_2``.

        Returns:
            A new list of matrices shaped like those of ``dna_1``.
        """
        offspring = []
        for layer_idx, first in enumerate(dna_1):
            blended = np.zeros(first.shape)
            for row, first_row in enumerate(first):
                for col, gene in enumerate(first_row):
                    if random.random() > 0.5:
                        blended[row][col] = gene
                    else:
                        blended[row][col] = dna_2[layer_idx][row][col]
            offspring.append(blended)
        return offspring

    @classmethod
    def mutate(cls, dna, rate):
        """Return a mutated copy of ``dna``; the input is left untouched.

        Each weight is independently replaced, with probability ``rate``, by a
        new value from ``random.random()`` (uniform on [0, 1)).
        """
        mutated = []
        for layer in dna:
            clone = np.copy(layer)
            for gene in np.nditer(clone, op_flags=['readwrite']):
                if random.random() < rate:
                    # nditer yields 0-d views; assigning through [...] writes into `clone`.
                    gene[...] = random.random()
            mutated.append(clone)
        return mutated

    def hidden_activation(self, Z):
        """ReLU: element-wise ``max(Z, 0)``."""
        return np.maximum(Z, 0)

    def softmax_activation(self, Z):
        """Softmax of ``Z``; the maximum is subtracted first for numerical stability."""
        shifted = np.exp(Z - Z.max())
        return np.array(shifted / shifted.sum())

    def predict(self, inputs):
        """Run a forward pass and return the index of the most probable output."""
        activation = np.append(np.array(inputs), 1)  # trailing 1 feeds the bias row
        for weights in self.W[:-1]:
            pre_activation = np.squeeze(activation @ weights)
            activation = np.append(self.hidden_activation(pre_activation), 1)
        logits = np.squeeze(activation @ self.W[-1])
        return np.argmax(self.softmax_activation(logits))

    def get_weights_copy(self):
        """Return a list with an independent copy of every weight matrix."""
        return [np.copy(layer) for layer in self.W]

In [141]:
class Agent:
    """One individual of the population: an ``NN`` policy plus its fitness score."""

    def __init__(self, env, w = None):
        """Create an agent for ``env``.

        Args:
            env: Environment exposing ``action_space.n`` and
                ``observation_space.shape``; only these sizes are read here.
            w: Optional sequence of weight matrices for the network. When it is
                ``None`` or empty, a random network with a single hidden layer
                of 2 units is created instead.
        """
        self.action_size = env.action_space.n
        self.observation_size = env.observation_space.shape[0]
        self.fitness = 0
        # Explicit None/length check instead of `if w:`. Truthiness raises
        # ValueError when the weights arrive as a multi-element NumPy array.
        if w is not None and len(w) > 0:
            self.NN = NN.from_weights(w)
        else:
            self.NN = NN.from_params(self.observation_size, [2], self.action_size)

    def get_action(self, observation):
        """Return the action index the network predicts for ``observation``."""
        return self.NN.predict(observation)

    def set_fitness(self, fitness):
        """Store the fitness score measured for this agent."""
        self.fitness = fitness

    def get_mutated_copy(self, rate):
        """Return mutated copies of this agent's weights.

        The result is a list of weight matrices, not an ``Agent``; the agent's
        own weights are left untouched.
        """
        return NN.mutate(self.NN.get_weights_copy(), rate)

    def get_dna(self):
        """Return an independent copy of the network's weight matrices."""
        return self.NN.get_weights_copy()
        

In [142]:
# --- Genetic-algorithm settings ---
algo_recherche = True   # True: tournament selection + crossover; False: keep the top agents and only mutate them
mutation_rate = 0.01    # probability that a single weight is replaced during mutation
total_run = 15          # number of episodes averaged into one fitness value
agent_quantity = 225    # population size; must be a perfect square (n parents + n*(n-1) children = n**2)
total_generation = 10   # maximum number of generations

# Create the first generation
agents = [Agent(env) for i in range(agent_quantity)]

# Best agent evaluated so far. It is tracked here because the population is
# replaced at the end of every generation: freshly bred children still have
# fitness 0, and the best candidate survives only if it wins a tournament.
# The fitness is stored separately because a surviving parent is re-evaluated
# (and its `fitness` attribute overwritten) in the next generation.
best_agent = None
best_fitness = float("-inf")

for generation in range(total_generation):
    print("Generation: " + str(generation) + " ----------------------------------------")
    print("Testing " + str(len(agents)) + " agents...")

    # Evaluate every agent
    for agent in agents:
        fitness = 0
        # Each agent plays several episodes; its fitness is the mean episode return
        for run in range(total_run):
            state = env.reset()
            # 1000 is only a safety cap; the loop normally ends when `done` is reported
            for t in range(1000):
                action = agent.get_action(state)
                state, reward, done, info = env.step(action)
                fitness += reward
                if done:
                    break
        agent.set_fitness(fitness/total_run)

    best_candidate = max(agents, key=operator.attrgetter("fitness"))
    print("Best candidate fitness: " + str(int(best_candidate.fitness)))
    if best_candidate.fitness > best_fitness:
        best_agent = best_candidate
        best_fitness = best_candidate.fitness
    # `>=` rather than `==`: avoids relying on exact float equality of an average
    if best_candidate.fitness >= 500.0:
        print("Early stopping")
        break

    # New generation
    print("Creating new generation")
    print("Parents:")
    parents = []
    new_agents = []

    # Parent selection
    parent_quantity = int(agent_quantity ** 0.5)
    if algo_recherche:
        # Tournament selection: each parent is the fittest of a random sample
        tournament_size = parent_quantity
        for i in range(parent_quantity):
            selected_candidates = random.sample(agents, tournament_size)
            champion = max(selected_candidates, key=operator.attrgetter("fitness"))
            parents.append(champion)
            # The champion itself also survives into the next generation
            new_agents.append(champion)
            print(int(champion.fitness))

        # Every ordered pair of parents produces one child, which is then mutated
        for i, j in permutations(parents, 2):
            child_dna = NN.crossover(i.get_dna(), j.get_dna())
            child = Agent(env, NN.mutate(child_dna, mutation_rate))
            new_agents.append(child)
    else:
        # Truncation selection: keep the fittest agents as parents
        agents.sort(key=operator.attrgetter("fitness"), reverse=True)
        for agent in agents[:parent_quantity]:
            new_agents.append(agent)
            parents.append(agent)
            print(agent.fitness)

        # Each parent contributes mutated clones until the population is full again
        for parent in parents:
            for i in range(agent_quantity//parent_quantity-1):
                new_agents.append(Agent(env, NN.mutate(parent.get_dna(), mutation_rate)))

    agents = new_agents
    print()

print("End training")
if best_agent is None:
    # No generation was evaluated (total_generation == 0): fall back to picking from the initial population
    best_agent = max(agents, key=operator.attrgetter("fitness"))

Generation: 0 ----------------------------------------
Testing 225 agents...
Best candidate's fitness: 27
Creating new generation
Parents:
12
11
9
9
12
9
10
19
11
20
24
12
19
9
27

Generation: 1 ----------------------------------------
Testing 225 agents...
Best candidate's fitness: 172
Creating new generation
Parents:
172
92
42
172
23
57
26
129
66
20
23
129
57
129
70

Generation: 2 ----------------------------------------
Testing 225 agents...
Best candidate's fitness: 298
Creating new generation
Parents:
63
298
171
74
150
94
171
134
93
119
171
171
130
122
164

Generation: 3 ----------------------------------------
Testing 225 agents...
Best candidate's fitness: 412
Creating new generation
Parents:
206
163
177
275
172
239
277
284
412
277
284
177
159
156
284

Generation: 4 ----------------------------------------
Testing 225 agents...
Best candidate's fitness: 473
Creating new generation
Parents:
365
266
322
274
274
373
332
314
365
314
350
365
252
282
473

Generation: 5 ---------------

In [143]:
# Evaluate the selected agent: mean episode return over `evaluation_runs` fresh episodes.
evaluation_score = 0
evaluation_runs = 100
for run in range(evaluation_runs):
    fitness = 0
    state = env.reset()
    # 1000 is only a safety cap; the loop normally ends when `done` is reported
    for t in range(1000):
        action = best_agent.get_action(state)
        state, reward, done, info = env.step(action)
        fitness += reward
        if done:
            break
    # Accumulate after the step loop so an episode that reaches the step cap
    # without `done` is still counted; the divisor below always includes it.
    evaluation_score += fitness
print(evaluation_score/evaluation_runs)


434.58


In [144]:
# Play one rendered episode with the selected agent and print the index of its last step.
state = env.reset()
for t in range(1000):
    state, reward, done, info = env.step(best_agent.get_action(state))
    env.render()
    if not done:
        continue
    print(t)
    break

499


In [145]:
# Close the shared environment now that training, evaluation and rendering are finished.
env.close()