# Reinforcement Learning

## Prueba del entorno PongSolitaire en Gym

In [None]:
import pygame
import gym
import gym_pong_solitaire

# Smoke test: drive the environment with random actions until the episode ends.
env = gym.make('PongSolitaire-v0')

# pygame clock used to throttle the loop below via tick(60).
frame_clock = pygame.time.Clock()

initial_state = env.reset()
print(initial_state)

# The loop body always runs at least once, so test `done` at the bottom.
while True:
    random_action = env.action_space.sample()
    state, reward, done, info = env.step(random_action)
    env.render(mode='stat')
    print(state)
    frame_clock.tick(60)
    if done:
        break
env.close()

# Inspect the spaces exposed by the environment.
print(env.observation_space.low)  # x_ball, y_ball, dx, dy, x_paddle
print(env.action_space.n)

## Episodio de entrenamiento

In [None]:
import gym
import gym_pong_solitaire
from agent import Agent

import numpy as np

# Training cell: play `episodes` games, storing every transition in the agent's
# memory and running one replay (learning) pass after each game.
env = gym.make('PongSolitaire-v0')

# Derive the observation length once so the reshapes below always match the
# input size handed to the agent (instead of hard-coding 5).
state_size = len(env.observation_space.low)

# The agent makes the training decisions for the resulting neural network.
agent = Agent(state_size, env.action_space.n)

episodes = 10  # Number of games played
time = 15_000  # Maximum number of states (steps) per episode
replay_size = 32  # Replay batch size (re-learning from past experiences)

max_score = 0

print("---------- Training start ----------")

# try/finally guarantees the environment is released even if training is
# interrupted (e.g. the notebook kernel is stopped) or a step raises.
try:
    for episode in range(episodes):
        progress = f"Episode {episode}/{episodes}"

        # States are reshaped to a single-row batch of shape (1, state_size).
        current_state = np.reshape(env.reset(), [1, state_size])
        total_reward = 0

        print("-------------------------\n" + progress)
        for t in range(time):
            action = agent.act(current_state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])

            agent.remember(current_state, action, reward, next_state, done)

            # Log every 16th step and every step that earned a reward.
            if t % 16 == 0 or reward > 0:
                print(f"\tTime {t} {progress}")

            current_state = next_state
            # The final score is the number of times the ball is bounced back.
            total_reward += reward
            max_score = max(max_score, total_reward)

            if done:
                print(f"\t{progress} finished | Score: {total_reward} | Max score: {max_score}")
                break

        # NOTE(review): replay runs once per episode, not once per step —
        # confirm against Agent.replay that this is the intended schedule.
        agent.replay(replay_size)
finally:
    env.close()

print("---------- Training end ----------")

# SERIALIZATION: save the trained model
agent.model.save('agent_model.h5')


## Testeando la red

### Recuperar la red

In [None]:
from keras.models import load_model

# Restore the network serialized at the end of the training cell.
model_path = 'agent_model.h5'
nn_model = load_model(model_path)

# Printing the model object is a quick check that loading succeeded.
print(nn_model)

### Testear comportamiento de la red en el ambiente

In [None]:

import gym
import gym_pong_solitaire
import numpy as np

# Evaluation cell: let the loaded network (`nn_model`) play one rendered
# episode, always choosing the action with the largest network output.
env = gym.make('PongSolitaire-v0')

# Derive the observation length from the environment instead of hard-coding 5.
state_size = len(env.observation_space.low)

# States are reshaped to a single-row batch of shape (1, state_size).
current_state = np.reshape(env.reset(), [1, state_size])

done = False
total_reward = 0

time = 0  # Number of steps played so far
# try/finally guarantees the environment is released even if a step or the
# prediction raises, or the run is interrupted.
try:
    while not done:
        prediction = nn_model.predict(current_state)
        action = np.argmax(prediction)  # index of the largest output

        next_state, reward, done, _ = env.step(action)
        env.render()

        next_state = np.reshape(next_state, [1, state_size])

        time += 1
        # Accumulate BEFORE printing so the displayed running total includes
        # the reward of the current step.
        # The final score is the number of times the ball is bounced back.
        total_reward += reward
        print("Time: " + str(time) + " | Reward: " + str(reward) + " | Total reward: " + str(total_reward))

        current_state = next_state
finally:
    env.close()

print("Total score: " + str(total_reward))