# Agente con CNN

In [1]:
import import_ipynb
from UtilsDQL import *

importing Jupyter notebook from UtilsDQL.ipynb


In [2]:
class Deep_Q_Learner(object):
    """Deep Q-Learning agent whose action-value function Q is a neural network.

    The agent follows an epsilon-greedy policy with a decaying epsilon, stores
    transitions in a replay memory and fits Q on sampled mini-batches,
    optionally bootstrapping from a separate target network.

    Relies on names that this section does not define and that are expected to
    be in module scope (presumably via ``from UtilsDQL import *``): ``torch``,
    ``np``, ``random``, ``device``, ``writer``, ``xavier``, ``Perceptron``,
    ``CNN``, ``DecayLineal``, ``Memoria`` and ``Experiencia``.
    """

    def __init__(self, dim_estado, dim_accion, params):
        """Build the Q network(s), the optimizer, the policy and the memory.

        Args:
            dim_estado: shape of one observation. A length-1 shape selects the
                perceptron, a length-3 shape selects the CNN.
            dim_accion: number of discrete actions.
            params: dict of hyper-parameters. Keys read here: 'gamma', 'lr',
                'use_target_network', 'epsilon_max', 'epsilon_min',
                'epsilon_decay_final_step', 'experience_memory_capacity'.

        Raises:
            ValueError: if ``dim_estado`` has neither 1 nor 3 dimensions.
        """
        self.dim_estado = dim_estado
        self.dim_accion = dim_accion
        self.params = params                         # hyper-parameter dictionary
        self.gamma = self.params['gamma']            # discount factor
        # Attribute name (sic) kept as-is so existing users of it keep working.
        self.tasa_apredizaje = self.params['lr']     # learning rate
        self.mejor_recompensa_media = -float('inf')  # best mean episode reward so far
        self.mejor_recompensa = -float('inf')        # best single-episode reward so far
        self.pasos_entrenamiento = 0                 # number of replay batches trained on

        if len(self.dim_estado) == 1:
            self.DQN = Perceptron
        elif len(self.dim_estado) == 3:
            self.DQN = CNN
        else:
            # Fail here with a clear message instead of an AttributeError on
            # the missing self.DQN a few lines below.
            raise ValueError(
                'dim_estado must have 1 or 3 dimensions, got {!r}'.format(dim_estado))

        self.Q = self.DQN(dim_estado, dim_accion, device).to(device)
        self.Q.apply(xavier)

        self.Q_optimizador = torch.optim.Adam(self.Q.parameters(), lr=self.tasa_apredizaje)

        if self.params['use_target_network']:
            self.Q_objetivo = self.DQN(dim_estado, dim_accion, device).to(device)
            # Start the target network as an exact copy of Q so the first
            # bootstrapped targets do not come from unrelated random weights.
            self.Q_objetivo.load_state_dict(self.Q.state_dict())

        self.politica = self.epsilon_greedy_Q        # the agent follows an epsilon-greedy policy
        self.epsilon_max = params['epsilon_max']
        self.epsilon_min = params['epsilon_min']
        self.epsilon_decay = DecayLineal(valor_ini=self.epsilon_max,
                                         valor_final=self.epsilon_min,
                                         pasos_max=self.params['epsilon_decay_final_step'])
        self.num_pasos = 0                           # number of actions selected so far

        self.memoria = Memoria(capacidad=int(self.params['experience_memory_capacity']))

    def accion(self, observacion):
        """Return the action chosen by the current policy for ``observacion``.

        The observation is divided by 255. A single 3-D observation whose last
        axis is shorter than its first one is treated as channel-last and is
        transposed to channel-first; any 3-D observation then gets a leading
        batch axis.
        """
        observacion = np.array(observacion) / 255.
        if observacion.ndim == 3:  # a single image, not a batch
            if observacion.shape[2] < observacion.shape[0]:
                # Height x Width x Channel -> Channel x Height x Width. This
                # has to be a transpose: reshape would keep the memory order
                # and scramble the pixels across channels.
                observacion = observacion.transpose(2, 0, 1)
            observacion = np.expand_dims(observacion, 0)  # add the batch axis
        return self.politica(observacion)

    def epsilon_greedy_Q(self, observacion):
        """Pick a random action with probability epsilon, else the greedy one.

        Logs the current epsilon, advances ``self.num_pasos`` and never
        explores when ``params['test']`` is truthy.
        """
        writer.add_scalar('DQL/epsilon', self.epsilon_decay(self.num_pasos), self.num_pasos)
        self.num_pasos += 1
        if random.random() < self.epsilon_decay(self.num_pasos) and not self.params['test']:
            return random.randrange(self.dim_accion)
        with torch.no_grad():  # action selection needs no autograd graph
            q_valores = self.Q(observacion).cpu().numpy()
        return np.argmax(q_valores)

    def aprende(self, s, a, r, s_next, done):
        """Do one online TD(0) Q-learning update from a single transition.

        Args:
            s: current observation.
            a: index of the action taken.
            r: reward received.
            s_next: next observation.
            done: whether the episode ended with this transition.
        """
        if done:
            # The value of a terminal state is zero.
            td_objetivo = r + 0.
        else:
            # The bootstrapped target is treated as a constant.
            with torch.no_grad():
                td_objetivo = r + self.gamma * torch.max(self.Q(s_next))
        td_error = td_objetivo - self.Q(s)[a]
        # Minimise the squared TD error; descending on the signed error would
        # simply push Q(s, a) up without bound.
        perdida = td_error.pow(2).mean()
        self.Q_optimizador.zero_grad()
        perdida.backward()
        self.Q_optimizador.step()

    def aprende_de_experiencia(self, experiencias):
        """Fit Q on a mini-batch of stored transitions.

        Args:
            experiencias: iterable of ``Experiencia`` records with fields
                ``obs``, ``action``, ``reward``, ``next_obs`` and ``done``.
        """
        batch_xp = Experiencia(*zip(*experiencias))
        obs_batch = np.array(batch_xp.obs) / 255.
        accion_batch = np.array(batch_xp.action)
        recompensa_batch = np.array(batch_xp.reward)
        if self.params['clip_rewards']:
            recompensa_batch = np.sign(recompensa_batch)
        next_obs_batch = np.array(batch_xp.next_obs) / 255.
        done_batch = np.array(batch_xp.done)

        if self.params['use_target_network']:
            # NOTE(review): the sync is keyed on num_pasos (actions selected),
            # not on pasos_entrenamiento, so it only fires when a training call
            # happens to land on a multiple of the frequency - confirm intent.
            if self.num_pasos % self.params['target_network_update_freq'] == 0:
                self.Q_objetivo.load_state_dict(self.Q.state_dict())
            red_bootstrap = self.Q_objetivo
        else:
            red_bootstrap = self.Q

        with torch.no_grad():  # targets are constants for the optimizer
            q_siguiente_max = red_bootstrap(next_obs_batch).max(1)[0].cpu().numpy()

        # logical_not stays correct even if the done flags were stored as
        # integers, where `~` would be a bitwise (not logical) negation.
        no_terminal = np.logical_not(done_batch)
        td_objetivo = recompensa_batch + no_terminal * self.gamma * q_siguiente_max

        td_objetivo = torch.from_numpy(td_objetivo).to(device)
        # gather() requires an int64 index tensor.
        accion_idx = torch.from_numpy(accion_batch).long().to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, accion_idx.view(-1, 1)),
            td_objetivo.float().unsqueeze(1))

        self.Q_optimizador.zero_grad()
        td_error.backward()
        writer.add_scalar('DQL/td_error', td_error.item(), self.num_pasos)
        self.Q_optimizador.step()

    def replay_experiencia(self, batch_size=None):
        """Sample a batch from the replay memory and learn from it.

        Args:
            batch_size: number of transitions to sample; defaults to
                ``params['replay_batch_size']``.
        """
        if batch_size is None:
            batch_size = self.params['replay_batch_size']
        experiencia_batch = self.memoria.sample(batch_size)
        self.aprende_de_experiencia(experiencia_batch)
        self.pasos_entrenamiento += 1

    def guardar(self, nombre_ent):
        """Save the Q weights and the best rewards seen so far.

        The file is ``params['save_dir'] + 'DQL_' + nombre_ent + '.ptm'``.
        """
        archivo = self.params['save_dir'] + 'DQL_' + nombre_ent + '.ptm'
        estado_agente = {'Q': self.Q.state_dict(),
                         'best_mean_reward': self.mejor_recompensa_media,
                         'best_reward': self.mejor_recompensa}
        torch.save(estado_agente, archivo)
        print('Estado del agente guardado en ', archivo)

    def cargar(self, nombre_ent):
        """Restore the Q weights and best rewards written by ``guardar``.

        The file is ``params['load_dir'] + 'DQL_' + nombre_ent + '.ptm'``.
        """
        archivo = self.params['load_dir'] + 'DQL_' + nombre_ent + '.ptm'
        # Keep every storage where it was deserialised; the model is moved to
        # `device` explicitly below.
        estado_agente = torch.load(archivo, map_location=lambda x, loc: x)

        self.Q.load_state_dict(estado_agente['Q'])
        self.Q.to(device)
        self.mejor_recompensa_media = estado_agente['best_mean_reward']
        self.mejor_recompensa = estado_agente['best_reward']

        print('Cargado el estado del Q modelo desde', archivo,
              ' con un mejor recompensa media de:', self.mejor_recompensa_media,
              ' y una mejor recompensa de :', self.mejor_recompensa)



### Ejecutando el agente...

Para ejecutarlo, hay que pasar este código a un archivo .py.

In [None]:
if __name__ == '__main__':
    # Training / evaluation driver for Deep_Q_Learner.
    #
    # NOTE(review): `args`, `params_manager`, `Atari`, `env_utils`, `gym` and
    # `writer` are not defined in this section; presumably they come from
    # `UtilsDQL` or from the .py script this cell is meant to be moved into.

    env_conf = params_manager.param_entorno()
    env_conf['env_name'] = args.env

    # In test mode, let the end of the game be the end of the episode rather
    # than ending the episode at the end of every life, so the reported
    # (mean and max) episode rewards are per game rather than per life.
    if args.test:
        env_conf["episodic_life"] = False

    # Reward granularity used when printing stats at the end of every episode:
    # per life when "episodic_life" is true, per game otherwise.
    rew_type = "LIFE" if env_conf["episodic_life"] else "GAME"

    # Use the custom useful_region for this environment ID when one is
    # configured, otherwise fall back to the 'Default' entry.
    custom_region_available = False
    for key, value in env_conf['useful_region'].items():
        if key in args.env:
            env_conf['useful_region'] = value
            custom_region_available = True
            break
    if not custom_region_available:
        env_conf['useful_region'] = env_conf['useful_region']['Default']

    print("Using env_conf:", env_conf)
    atari_env = any(game.replace("_", "") in args.env.lower()
                    for game in Atari.get_games_list())
    if atari_env:
        env = Atari.make_env(args.env, env_conf)
    else:
        print("Given environment name is not an Atari Env. Creating a Gym env")
        # Resize the obs to w x h (84 x 84 by default) and then reshape it to be in the C x H x W format
        env = env_utils.ResizeReshapeFrames(gym.make(args.env))

    if args.record:  # If monitor is enabled, record stats and video of agent's performance
        env = gym.wrappers.Monitor(env, args.recording_output_dir, force=True)

    observation_shape = env.observation_space.shape
    action_shape = env.action_space.n
    # NOTE(review): `param_entorno` above suggests the manager's methods were
    # renamed; confirm that `get_agent_params` exists under this name.
    agent_params = params_manager.get_agent_params()
    agent_params["test"] = args.test
    agent = Deep_Q_Learner(observation_shape, action_shape, agent_params)

    episode_rewards = list()
    prev_checkpoint_mean_ep_rew = agent.mejor_recompensa_media
    num_improved_episodes_before_checkpoint = 0  # To keep track of the num of ep with higher perf to save model
    print("Using agent_params:", agent_params)
    if agent_params['load_trained_model']:
        try:
            agent.cargar(env_conf["env_name"])
            prev_checkpoint_mean_ep_rew = agent.mejor_recompensa_media
        except FileNotFoundError:
            print("WARNING: No trained model found for this environment. Training from scratch.")

    episode = 0
    global_step_num = 0  # total environment steps taken across all episodes
    try:
        while global_step_num <= agent_params['max_training_steps']:
            obs = env.reset()
            cum_reward = 0.0  # Cumulative reward of the current episode
            done = False
            step = 0
            while not done:
                if env_conf['render'] or args.render:
                    env.render()
                action = agent.accion(obs)
                next_obs, reward, done, info = env.step(action)
                # NOTE(review): assumes Memoria exposes `store`/`get_size`
                # next to the `sample` method the agent uses - confirm.
                agent.memoria.store(Experiencia(obs, action, reward, next_obs, done))

                obs = next_obs
                cum_reward += reward
                step += 1
                global_step_num += 1

            # End-of-episode bookkeeping (the inner loop only exits on done).
            episode += 1
            episode_rewards.append(cum_reward)
            mean_ep_rew = np.mean(episode_rewards)
            if cum_reward > agent.mejor_recompensa:
                agent.mejor_recompensa = cum_reward
            if mean_ep_rew > prev_checkpoint_mean_ep_rew:
                num_improved_episodes_before_checkpoint += 1
            if num_improved_episodes_before_checkpoint >= agent_params["save_freq_when_perf_improves"]:
                prev_checkpoint_mean_ep_rew = mean_ep_rew
                agent.mejor_recompensa_media = mean_ep_rew
                agent.guardar(env_conf['env_name'])
                num_improved_episodes_before_checkpoint = 0
            # `step` already counts every env.step() call of this episode.
            print("\nEpisode#{} ended in {} steps. Per {} stats: reward ={} ; mean_reward={:.3f} best_reward={}".
                  format(episode, step, rew_type, cum_reward, mean_ep_rew, agent.mejor_recompensa))
            writer.add_scalar("main/ep_reward", cum_reward, global_step_num)
            writer.add_scalar("main/mean_ep_reward", mean_ep_rew, global_step_num)
            writer.add_scalar("main/max_ep_rew", agent.mejor_recompensa, global_step_num)
            # Learn from batches of experience once a certain amount of xp is available unless in test only mode
            if agent.memoria.get_size() >= 2 * agent_params['replay_start_size'] and not args.test:
                agent.replay_experiencia()
    finally:
        # Release the environment and flush the logs even if training aborts.
        env.close()
        writer.close()
