# Este entrenamiento prueba el algoritmo VPG (Vanilla Policy Gradient)

### Instalar MuJoCo

In [None]:
# Include this at the top of your Colab notebook.
# One-time installation of MuJoCo 2.1.0 and mujoco-py. The marker file
# `.mujoco_setup_complete` is created at the end, so re-running this cell
# skips the whole installation when that file already exists.
import os
if not os.path.exists('.mujoco_setup_complete'):
  # Install the system packages (OSMesa / GL / GLFW / GLEW / patchelf)
  !apt-get -qq update
  !apt-get -qq install -y libosmesa6-dev libgl1-mesa-glx libglfw3 libgl1-mesa-dev libglew-dev patchelf
  # Download the MuJoCo 2.1.0 archive and unpack it under ~/.mujoco
  !mkdir ~/.mujoco
  !wget -q https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz
  !tar -zxf mujoco.tar.gz -C "$HOME/.mujoco"
  !rm mujoco.tar.gz
  # Append the library paths to ~/.bashrc (only affects shells started later)
  !echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/.mujoco/mujoco210/bin' >> ~/.bashrc 
  !echo 'export LD_PRELOAD=$LD_PRELOAD:/usr/lib/x86_64-linux-gnu/libGLEW.so' >> ~/.bashrc 
  # Register the MuJoCo library directory with ldconfig so the shared
  # libraries can be found during the current session as well.
  # NOTE(review): the path is hard-coded to /root — assumes the notebook runs as root.
  !echo "/root/.mujoco/mujoco210/bin" > /etc/ld.so.conf.d/mujoco_ld_lib_path.conf
  !ldconfig
  # Install mujoco-py (2.1.x series)
  !pip3 install -U 'mujoco-py<2.2,>=2.1'
  # Drop the marker file so this installation runs only once
  !touch .mujoco_setup_complete

# Per-kernel guard: the first time this cell runs `_mujoco_run_once` does not
# exist yet, which is treated exactly like False.
if '_mujoco_run_once' not in globals():
  _mujoco_run_once = False

if not _mujoco_run_once:
  # Extend the loader paths of the running process: append to an existing
  # value with ':' or create the variable when it is not set at all.
  if 'LD_LIBRARY_PATH' in os.environ:
    os.environ['LD_LIBRARY_PATH'] += ':/root/.mujoco/mujoco210/bin'
  else:
    os.environ['LD_LIBRARY_PATH'] = '/root/.mujoco/mujoco210/bin'

  if 'LD_PRELOAD' in os.environ:
    os.environ['LD_PRELOAD'] += ':/usr/lib/x86_64-linux-gnu/libGLEW.so'
  else:
    os.environ['LD_PRELOAD'] = '/usr/lib/x86_64-linux-gnu/libGLEW.so'

  # Import mujoco_py here so its first-import output shows up in this cell
  # instead of on the first environment initialization.
  import mujoco_py
  _mujoco_run_once = True


### Instalar librerías

In [None]:
# Clone the TEG project and install it in editable mode.
# !rm -r TEG/   <- uncomment when an existing TEG checkout has to be removed first
!git clone https://github.com/Alexfm101/TEG.git
!pip3 install -e TEG

In [None]:
#clonar spinning up
!sudo apt-get update && sudo apt-get install libopenmpi-dev
!git clone https://github.com/openai/spinningup.git
!pip3 install -e spinningup

### Configurar librerías para servidor (opcional)

In [None]:
# Add the TEG checkout to sys.path so its packages can be imported.
# 'TEG' is a relative path — assumes the working directory contains the clone.
import sys
sys.path.append('TEG')

### Importar librerías (probar desde aquí)

In [None]:
import gym
from RobotEnv.envs.UR5_Env import UR5_EnvTest #acuerdate de agregar TEG para colab
from spinningup.spinup import vpg_pytorch as vpg
import numpy as np
import torch
import  torch.nn as nn
from torch.optim import Adam
from torch.distributions.normal import Normal

### Configurar entrenamiento



In [None]:
# crear actor y critico

# Multilayer perceptron (MLP)
def mlp(sizes, activation, output_activation=nn.Identity):
    """Build a fully connected network as an ``nn.Sequential``.

    Args:
        sizes: layer widths, input size first and output size last.
        activation: module class instantiated after every layer except the last.
        output_activation: module class instantiated after the last layer.

    Returns:
        ``nn.Sequential`` alternating ``nn.Linear`` layers and activation
        modules; empty when ``sizes`` has fewer than two entries.
    """
    final_idx = len(sizes) - 2
    modules = []
    for idx, (n_in, n_out) in enumerate(zip(sizes[:-1], sizes[1:])):
        nonlinearity = output_activation if idx == final_idx else activation
        modules.append(nn.Linear(n_in, n_out))
        modules.append(nonlinearity())

    return nn.Sequential(*modules)

# Control policy
class Policy(nn.Module):
    """Diagonal-Gaussian policy for continuous actions.

    The mean is produced by an MLP over the observation; the log standard
    deviation is a learnable parameter that does not depend on the
    observation and is initialised to -0.5 for every action dimension.

    Args:
        obs_dim: size of the observation vector.
        act_dim: size of the action vector.
        hidden_sizes: iterable with the hidden-layer widths of the mean network.
        activation: activation module class for the hidden layers.
    """

    def __init__(self, obs_dim, act_dim, hidden_sizes, activation):
        super().__init__()
        log_std = -0.5 * np.ones(act_dim, dtype=np.float32)
        self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))
        self.mu_net = mlp([obs_dim] + list(hidden_sizes) + [act_dim], activation)

    def distribution(self, obs):
        """Return the diagonal Gaussian ``Normal(mu(obs), exp(log_std))``."""
        mu = self.mu_net(obs)
        std = torch.exp(self.log_std)
        return Normal(mu, std)

    def log_prob_from_distribution(self, pi, action):
        """Return the log-likelihood of ``action`` under ``pi``.

        ``Normal.log_prob`` yields one value per action component; summing
        over the last axis gives the joint log-probability of the whole
        action vector (the components are independent).
        """
        return pi.log_prob(action).sum(dim=-1)

    def forward(self, obs, action=None):
        """Return the action distribution for ``obs``.

        Args:
            obs: observation tensor fed to the mean network.
            action: optional action tensor; when given, its log-likelihood
                under the distribution is computed as well.

        Returns:
            Tuple ``(pi, logp_a)`` where ``logp_a`` is ``None`` when no
            action was supplied.
        """
        pi = self.distribution(obs)
        logp_a = None
        if action is not None:
            logp_a = self.log_prob_from_distribution(pi, action)

        return pi, logp_a

    # Backward-compatible alias for the original (misspelled) method name.
    foward = forward

# Value function
class Value_function(nn.Module):
    """State-value estimator: an MLP mapping an observation to one scalar.

    Args:
        obs_dim: size of the observation vector.
        hidden_sizes: iterable with the hidden-layer widths.
        activation: activation module class for the hidden layers.
    """

    def __init__(self, obs_dim, hidden_sizes, activation):
        super().__init__()
        self.value_network = mlp([obs_dim] + list(hidden_sizes) + [1], activation)

    # Defined at class level so nn.Module.__call__ can dispatch to it; the
    # original nested it inside __init__, where it was never reachable.
    def forward(self, obs):
        """Return the value estimate with the trailing size-1 axis removed."""
        return torch.squeeze(self.value_network(obs), -1)

# Actor-critic
class ActorCritic(nn.Module):
    """Bundle a Gaussian policy (actor) and a value function (critic).

    Args:
        observation_space: space object; ``shape[0]`` is used as the
            observation size.
        action_space: space object; ``shape[0]`` is used as the action size.
        hidden_sizes: hidden-layer widths shared by both networks.
        activation: activation module class for the hidden layers of both
            networks.
    """

    def __init__(self, observation_space, action_space,
                 hidden_sizes=(64, 64), activation=nn.Tanh):
        super().__init__()

        obs_dim = observation_space.shape[0]
        act_dim = action_space.shape[0]

        # Actor (policy). The constructor arguments are forwarded instead of
        # being silently replaced by hard-coded values.
        self.pi = Policy(obs_dim, act_dim, hidden_sizes, activation)

        # Critic (value function).
        self.v = Value_function(obs_dim, hidden_sizes=hidden_sizes,
                                activation=activation)

    def step(self, obs):
        """Sample an action for ``obs`` without tracking gradients.

        Returns:
            Tuple of NumPy arrays ``(action, value, logp_action)``.
        """
        with torch.no_grad():
            pi = self.pi.distribution(obs)
            a = pi.sample()
            logp_a = self.pi.log_prob_from_distribution(pi, a)
            v = self.v(obs)
        return a.numpy(), v.numpy(), logp_a.numpy()

    def act(self, obs):
        """Return only the sampled action from :meth:`step`."""
        return self.step(obs)[0]


In [None]:
# Train
# Build the UR5 environment once; the factory passed to vpg hands out this
# same instance.
env = UR5_EnvTest(simulation_frames=10, torque_control= 0.01, distance_threshold=0.5, Gui=False)

# Spinning Up's vpg takes a zero-argument environment factory (it calls
# `env_fn()`) and an actor-critic constructor (it calls
# `actor_critic(observation_space, action_space, **ac_kwargs)`), so the
# ActorCritic class itself is passed rather than a pre-built instance.
vpg(lambda: env, actor_critic=ActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=2000,
    epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80,
    lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10)