# Agente con CNN

In [8]:
import import_ipynb
from UtilsDQL import *

In [None]:
class Deep_Q_Learner(object):
    """Deep Q-Learning agent that approximates Q(s, a) with a neural network.

    The names ``torch``, ``np``, ``random``, ``device``, ``writer``,
    ``Perceptron``, ``CNN``, ``xavier``, ``DecayLineal``, ``Memoria`` and
    ``Experiencia`` are not defined in this block; they are expected to come
    from the ``UtilsDQL`` star import at the top of the file.
    """

    def __init__(self, dim_estado, dim_accion, params):
        """
        Build the Q network(s), the optimizer, the exploration schedule and
        the experience memory.

        self.Q is the Action-Value function. If the observation shape has a
        single dimension a ``Perceptron`` is used; if it has three dimensions
        (image) a ``CNN`` is used.

        :param dim_estado: Shape (tuple) of the observation/state
        :param dim_accion: Number of discrete actions
        :param params: A dictionary containing the agent configuration
            parameters and hyper-parameters. Keys read by this class:
            'gamma', 'lr', 'use_target_network', 'epsilon_max', 'epsilon_min',
            'epsilon_decay_final_step', 'experience_memory_capacity', 'test',
            'clip_rewards', 'target_network_update_freq', 'replay_batch_size',
            'save_dir', 'load_dir'.
        :raises ValueError: if ``dim_estado`` is neither 1- nor 3-dimensional
        """
        self.dim_estado = dim_estado
        self.dim_accion = dim_accion
        self.params = params
        self.gamma = self.params['gamma']              # Agent's discount factor
        self.tasa_apredizaje = self.params['lr']       # Agent's Q-learning rate
        self.mejor_recompensa_media = -float('inf')    # Agent's personal best mean episode reward
        self.mejor_recompensa = -float('inf')          # Agent's personal best episode reward
        self.pasos_entrenamiento = 0                   # Number of training batch steps completed so far

        num_dims = len(self.dim_estado)
        if num_dims == 1:                              # Single dimensional observation/state space
            self.DQN = Perceptron
        elif num_dims == 3:                            # 3D/image observation/state
            self.DQN = CNN
        else:
            # Fail early with a clear message instead of an AttributeError on self.DQN below.
            raise ValueError(
                'Unsupported observation shape %r: expected 1 or 3 dimensions' % (self.dim_estado,))

        self.Q = self.DQN(dim_estado, dim_accion, device).to(device)
        self.Q.apply(xavier)

        self.Q_optimizador = torch.optim.Adam(self.Q.parameters(), lr=self.tasa_apredizaje)

        if self.params['use_target_network']:
            self.Q_objetivo = self.DQN(dim_estado, dim_accion, device).to(device)
            # Start the target network as an exact copy of the online network so the
            # first TD targets are not computed from unrelated random weights.
            self.Q_objetivo.load_state_dict(self.Q.state_dict())

        # self.politica is the policy followed by the agent: epsilon-greedy
        # with respect to its current Q estimate.
        self.politica = self.epsilon_greedy_Q
        self.epsilon_max = params['epsilon_max']
        self.epsilon_min = params['epsilon_min']
        self.epsilon_decay = DecayLineal(valor_ini=self.epsilon_max,
                                         valor_final=self.epsilon_min,
                                         pasos_max=self.params['epsilon_decay_final_step'])
        self.num_pasos = 0                             # Number of action-selection steps taken so far

        # Experience memory whose capacity is taken from the configuration.
        self.memoria = Memoria(capacidad=int(self.params['experience_memory_capacity']))

    def accion(self, observacion):
        """
        Pre-process an observation and return the action chosen by the policy.

        :param observacion: A single observation (array-like, possibly lazy frames)
        :return: Whatever ``self.politica`` returns for the pre-processed observation
        """
        observacion = np.array(observacion)   # Observations could be lazy frames, so force a fetch
        observacion = observacion / 255.      # Scale by the max value of a uint8 observation
        if observacion.ndim == 3:             # Single image (not a batch)
            if observacion.shape[2] < observacion.shape[0]:
                # Probably a channel-last (W x H x C) image.
                # NOTE: This is just an additional check; the env wrappers are expected
                # to take care of this conversion already.
                # Move the axes to C x H x W as per PyTorch's convention. A transpose is
                # required here: reshape() would keep the memory order and scramble pixels.
                observacion = observacion.transpose(2, 1, 0)
            observacion = np.expand_dims(observacion, 0)  # Create a batch dimension
        return self.politica(observacion)

    def epsilon_greedy_Q(self, observacion):
        """
        Choose an action epsilon-greedily with respect to the current Q estimate.

        Logs the current epsilon, advances the step counter, and explores with
        probability epsilon unless ``params['test']`` is set.

        :param observacion: Pre-processed observation passed to ``self.Q``
        :return: Index of the selected action
        """
        # Decay epsilon/exploration as per schedule
        writer.add_scalar('DQL/epsilon', self.epsilon_decay(self.num_pasos), self.num_pasos)
        self.num_pasos += 1
        explorar = random.random() < self.epsilon_decay(self.num_pasos) and not self.params['test']
        if explorar:
            accion = random.randrange(self.dim_accion)
        else:
            with torch.no_grad():  # Action selection never needs gradients
                valores_q = self.Q(observacion).detach().cpu().numpy()
            accion = np.argmax(valores_q)
        return accion

    def aprende(self, s, a, r, s_next, done):
        """
        Perform one online TD(0) Q-learning update from a single transition.

        :param s: Observation before the action
        :param a: Index of the action taken (used to index ``self.Q(s)``)
        :param r: Reward received
        :param s_next: Observation after the action
        :param done: True if the transition ended the episode
        """
        if done:  # End of episode
            td_objetivo = r + 0.  # The value of a terminal state is zero
        else:
            # The bootstrapped target is treated as a constant: no gradient flows through it.
            with torch.no_grad():
                td_objetivo = r + self.gamma * torch.max(self.Q(s_next))
        td_error = td_objetivo - self.Q(s)[a]
        # Minimise the squared TD error; back-propagating the raw signed error
        # would not regress Q(s, a) towards the target.
        perdida = td_error.pow(2)
        self.Q_optimizador.zero_grad()
        perdida.backward()
        self.Q_optimizador.step()

    def aprende_de_experiencia(self, experiencias):
        """
        Perform one batch Q-learning update from a list of stored experiences.

        :param experiencias: Iterable of ``Experiencia`` records with the fields
            ``obs``, ``action``, ``reward``, ``next_obs`` and ``done``
        """
        batch_xp = Experiencia(*zip(*experiencias))
        obs_batch = np.array(batch_xp.obs) / 255.            # Scale by the max value of a uint8 observation
        accion_batch = np.array(batch_xp.action)
        recompensa_batch = np.array(batch_xp.reward)
        if self.params['clip_rewards']:
            recompensa_batch = np.sign(recompensa_batch)     # Clip the rewards to {-1, 0, 1}
        next_obs_batch = np.array(batch_xp.next_obs) / 255.  # Scale by the max value of a uint8 observation
        # Force a boolean dtype so that the "not done" mask below is always 0/1.
        done_batch = np.array(batch_xp.done, dtype=bool)

        if self.params['use_target_network']:
            if self.num_pasos % self.params['target_network_update_freq'] == 0:
                # The *update_freq is the number of steps after which the target net is updated.
                # A schedule can be used instead to vary the update frequency.
                self.Q_objetivo.load_state_dict(self.Q.state_dict())
            red_bootstrap = self.Q_objetivo
        else:
            red_bootstrap = self.Q

        # Max next-state value per sample; computed without gradients because the
        # TD target is a constant with respect to the parameters being optimised.
        with torch.no_grad():
            q_next_max = red_bootstrap(next_obs_batch).max(1)[0].cpu().numpy()
        no_terminal = np.logical_not(done_batch)             # Terminal states contribute no future value
        td_objetivo = recompensa_batch + no_terminal * self.gamma * q_next_max

        td_objetivo = torch.from_numpy(td_objetivo).to(device)
        # gather() requires int64 indices regardless of the platform's default int size.
        accion_idx = torch.from_numpy(accion_batch).long().to(device)
        td_error = torch.nn.functional.mse_loss(
            self.Q(obs_batch).gather(1, accion_idx.view(-1, 1)),
            td_objetivo.float().unsqueeze(1))

        self.Q_optimizador.zero_grad()
        td_error.mean().backward()
        writer.add_scalar('DQL/td_error', td_error.mean(), self.num_pasos)
        self.Q_optimizador.step()

    def replay_experiencia(self, batch_size=None):
        """
        Sample a batch from the experience memory and learn from it.

        :param batch_size: Number of experiences to sample; defaults to
            ``params['replay_batch_size']`` when None
        """
        if batch_size is None:
            batch_size = self.params['replay_batch_size']
        experiencia_batch = self.memoria.sample(batch_size)
        self.aprende_de_experiencia(experiencia_batch)
        self.pasos_entrenamiento += 1  # Increment the number of training batch steps completed

    def guardar(self, nombre_ent):
        """
        Save the Q network weights and the best rewards seen so far.

        :param nombre_ent: Environment name used to build the file name
            ``<save_dir>DQL_<nombre_ent>.ptm``
        """
        archivo = self.params['save_dir'] + 'DQL_' + nombre_ent + '.ptm'
        estado_agente = {'Q': self.Q.state_dict(),
                         'best_mean_reward': self.mejor_recompensa_media,
                         # The best episode reward lives in ``mejor_recompensa``
                         # (the attribute initialised in __init__).
                         'best_reward': self.mejor_recompensa}
        torch.save(estado_agente, archivo)
        print('Estado del agente guardado en ', archivo)

    def cargar(self, nombre_ent):
        """
        Load the Q network weights and the best rewards from a saved file.

        :param nombre_ent: Environment name used to build the file name
            ``<load_dir>DQL_<nombre_ent>.ptm``
        """
        archivo = self.params['load_dir'] + 'DQL_' + nombre_ent + '.ptm'
        # Load the tensors onto the CPU first; the model is moved to ``device`` below.
        estado_agente = torch.load(archivo, map_location=torch.device('cpu'))

        self.Q.load_state_dict(estado_agente['Q'])
        self.Q.to(device)
        self.mejor_recompensa_media = estado_agente['best_mean_reward']
        self.mejor_recompensa = estado_agente['best_reward']
        # Kept as an alias for callers that still read the attribute this method used to set.
        self.recompensa_media = self.mejor_recompensa

        print('Caragdo el estado del Q modelo desde', archivo,
              ' con un mejor recompensa media de:', self.mejor_recompensa_media,
              ' y una mejor recompensa de :', self.mejor_recompensa)

