<a href="https://colab.research.google.com/github/aderaustmit/air_hockey_challenge/blob/warm-up/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("test")

import torch
print(torch.cuda.is_available())

test
True


In [1]:
import torch.nn as nn
from air_hockey_challenge.framework.evaluate_agent import PENALTY_POINTS
from air_hockey_challenge.utils.kinematics import forward_kinematics
import numpy as np

class Network(nn.Module):
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super(Network, self).__init__()

        self._h1 = nn.Linear(input_shape[0], n_features)
        self._h2 = nn.Linear(n_features, n_features)
        self._h3 = nn.Linear(n_features, output_shape[0])

        nn.init.xavier_uniform_(self._h1.weight,
                                gain=nn.init.calculate_gain('tanh'))
        nn.init.xavier_uniform_(self._h2.weight,
                                gain=nn.init.calculate_gain('tanh'))
        nn.init.xavier_uniform_(self._h3.weight,
                                gain=nn.init.calculate_gain('linear'))

    def forward(self, obs, **kwargs):
        features1 = torch.tanh(self._h1(torch.squeeze(obs, 1).float()))
        features2 = torch.tanh(self._h2(features1))
        return self._h3(features2)
    
def custom_reward_function(base_env, state, action, next_state, absorbing):
    reward = -0.1 # small value to disincentivate long runs: faster is better

    q = state[base_env.env_info['joint_pos_ids']]
    dq = state[base_env.env_info['joint_vel_ids']]

    # Get a dictionary of the constraint functions {"constraint_name": ndarray}
    c = base_env.env_info['constraints'].fun(q, dq)

    # for k,v in c.items():
    #     reward -= PENALTY_POINTS[k] * np.sum(v > 0)
    # mybe not ideal: penalty is flat outside, does not know how to improve --> use violation instead of points?

    for k,val in c.items():
        for v in val:
            reward -= 100*v if v > 0 else 0

    # TODO reward for good actions --> score
    # for now --> try to go to the puck
    
    ee_pos = forward_kinematics(base_env.env_info['robot']['robot_model'], base_env.env_info['robot']['robot_data'], q)[0]
    ee_pos = ee_pos - np.array([1.51,0,0.1])
    puck_pos = state[base_env.env_info['puck_pos_ids']]
    puck_vel = np.linalg.norm(state[base_env.env_info['puck_vel_ids']])
    print(puck_vel)
    reward += -np.linalg.norm(ee_pos - puck_pos) if puck_vel < 0.1 else puck_vel

    return reward

from mushroom_rl.algorithms.actor_critic import PPO

class PPOAgent(PPO): # Just to fix the action shape
    def draw_action(self, state):
        return super().draw_action(state).reshape(2,3)
    
    def fit(self, dataset, **info):
        for i in range(len(dataset)):
            dataset[i] = list(dataset[i])
            dataset[i][1] = dataset[i][1].flatten()
        return super().fit(dataset, **info)

In [3]:
from air_hockey_challenge.framework.air_hockey_challenge_wrapper import AirHockeyChallengeWrapper
from air_hockey_challenge.framework.challenge_core import ChallengeCore


from mushroom_rl.core import Logger
from mushroom_rl.algorithms.actor_critic import PPO
from mushroom_rl.policy import GaussianTorchPolicy

from mushroom_rl.utils.dataset import compute_metrics

import torch
import numpy as np
from tqdm import tqdm

use_cuda = torch.cuda.is_available()
# use_cuda = False

# logger = Logger(PPO.__name__, results_dir=None)
# logger.strong_line()
# logger.info('Experiment Algorithm: ' + PPO.__name__)

mdp = AirHockeyChallengeWrapper("3dof-hit", custom_reward_function=custom_reward_function)
mdp.reset()

critic_params = dict(network=Network,
                        optimizer={'class': torch.optim.Adam,
                                'params': {}},
                        loss=torch.nn.functional.mse_loss,
                        n_features=64,
                        batch_size=64,
                        input_shape=(12,),
                        output_shape=(1,), 
                        use_cuda = use_cuda)

alg_params = dict(actor_optimizer={'class': torch.optim.Adam,
                                    'params': {}},
                    critic_params=critic_params,
                    n_epochs_policy = 5,
                    batch_size = 32,
                    eps_ppo = 0.01,
                    lam = 0.99,
                 )

policy_params = dict(
    std_0=0.01,
    n_features=64,
    use_cuda=use_cuda
)

policy = GaussianTorchPolicy(Network,
                                (12,),
                                (6,),
                                **policy_params)

policy.load("logs/bc_ppo/bc/policy.pkl")

agent = PPOAgent(mdp.info, policy, **alg_params)

# Algorithm
core = ChallengeCore(agent, mdp)

log_metrics = []

def train(n_epochs, n_episode_learn, n_episode_eval, n_episode_per_fit):
    dataset = core.evaluate(n_episodes=n_episode_eval, render=False)
    metrics = compute_metrics(dataset)
    # logger.epoch_info(0, metrics=metrics)
    # logger.info('Tranining start')
    print("Epoch:",0,"metrics:",metrics)
    print("Training start")
    
    for n in tqdm(range(n_epochs)):
        core.learn(n_episodes=n_episode_learn, n_episodes_per_fit=n_episode_per_fit)
        dataset = core.evaluate(n_episodes=n_episode_eval, render=False)
        metrics = compute_metrics(dataset)
        # logger.epoch_info(n+1, metrics=metrics)
        print("Epoch:",n+1,"metrics:",metrics)
        log_metrics.append(metrics)
        agent.save(f"dataset/PPO_BC/new_epoch_{n+1}.pkl")

    # logger.info('Tranining end')
    print("Training end")


In [None]:
train(n_epochs=25, n_episode_learn=100, n_episode_eval=5, n_episode_per_fit=1)

In [None]:
mdp.env_info["robot"]["base_frame"]

In [6]:
dataset = core.evaluate(n_episodes=5, render=True)

                                             

: 

In [5]:
from air_hockey_challenge.framework.evaluate_agent import evaluate
from mushroom_rl.policy import GaussianTorchPolicy

policy = GaussianTorchPolicy(Network,
                                (12,),
                                (6,),
                                **policy_params)

policy.load("dataset/PPO_BC/new_epoch_25.pkl")

def build_agent(env_info, **kwargs):
    """
    Function where an Agent that controls the environments should be returned.
    The Agent should inherit from the mushroom_rl Agent base env.

    :param env_info: The environment information
    :return: Either Agent ot Policy
    """
    if "hit" in env_info["env_name"]:
        policy = GaussianTorchPolicy(Network,
                                (12,),
                                (6,),
                                **policy_params)
        policy.load("dataset/PPO2/epoch_10.pkl")

        agent = PPOAgent(mdp.info, policy, **alg_params)
        return agent

eval = evaluate(build_agent, 'logs/ppo_agent', env_list=["3dof-hit"], n_episodes=50, render=False, quiet=False, n_cores=1)

                                               

Environment:        3dof-hit
Number of Episodes: 50
Success:            0.0200
Penalty:            226
Number of Violations: 
  joint_vel_constr  50
  ee_constr         42
  Jerk              50
  Total             142
-------------------------------------------------

