<a href="https://colab.research.google.com/github/aderaustmit/air_hockey_challenge/blob/warm-up/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("test")

import torch
print(torch.cuda.is_available())

test
True


In [2]:
import torch.nn as nn

class Network(nn.Module):
    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        super(Network, self).__init__()

        self._h1 = nn.Linear(input_shape[0], n_features)
        self._h2 = nn.Linear(n_features, n_features)
        self._h3 = nn.Linear(n_features, output_shape[0])

        nn.init.xavier_uniform_(self._h1.weight,
                                gain=nn.init.calculate_gain('tanh'))
        nn.init.xavier_uniform_(self._h2.weight,
                                gain=nn.init.calculate_gain('tanh'))
        nn.init.xavier_uniform_(self._h3.weight,
                                gain=nn.init.calculate_gain('linear'))

    def forward(self, obs, **kwargs):
        features1 = torch.tanh(self._h1(torch.squeeze(obs, 1).float()))
        features2 = torch.tanh(self._h2(features1))
        return self._h3(features2)

In [3]:
from air_hockey_challenge.framework.evaluate_agent import PENALTY_POINTS
from air_hockey_challenge.utils.kinematics import forward_kinematics
import numpy as np


def custom_reward_function(base_env, state, action, next_state, absorbing):
    reward = -0.1 # small value to disincentivate long runs: faster is better

    q = state[base_env.env_info['joint_pos_ids']]
    dq = state[base_env.env_info['joint_vel_ids']]

    # Get a dictionary of the constraint functions {"constraint_name": ndarray}
    c = base_env.env_info['constraints'].fun(q, dq)

    # for k,v in c.items():
    #     reward -= PENALTY_POINTS[k] * np.sum(v > 0)
    # mybe not ideal: penalty is flat outside, does not know how to improve --> use violation instead of points?

    for k,val in c.items():
        for v in val:
            reward -= v if v > 0 else 0

    # TODO reward for good actions --> score
    # for now --> try to go to the puck
    
    ee_pos = forward_kinematics(base_env.env_info['robot']['robot_model'], base_env.env_info['robot']['robot_data'], q)[0]
    puck_pos = base_env.env_info['puck_pos_ids']
    puck_vel = np.linalg.norm(state[base_env.env_info['puck_vel_ids']])
    reward += -np.linalg.norm(ee_pos - puck_pos) if puck_vel < 0.1 else puck_vel

    return reward

In [4]:
from mushroom_rl.algorithms.actor_critic import PPO

class PPOAgent(PPO): # Just to fix the action shape
    def draw_action(self, state):
        return super().draw_action(state).reshape(2,3)
    
    def fit(self, dataset, **info):
        for i in range(len(dataset)):
            dataset[i] = list(dataset[i])
            dataset[i][1] = dataset[i][1].flatten()
        return super().fit(dataset, **info)

In [5]:
from air_hockey_challenge.framework.air_hockey_challenge_wrapper import AirHockeyChallengeWrapper
from air_hockey_challenge.framework.challenge_core import ChallengeCore


from mushroom_rl.core import Logger
from mushroom_rl.algorithms.actor_critic import PPO
from mushroom_rl.policy import GaussianTorchPolicy

from mushroom_rl.utils.dataset import compute_metrics

import torch
import numpy as np
from tqdm import tqdm

cuda = torch.cuda.is_available()

# logger = Logger(PPO.__name__, results_dir=None)
# logger.strong_line()
# logger.info('Experiment Algorithm: ' + PPO.__name__)

mdp = AirHockeyChallengeWrapper("3dof-hit", custom_reward_function=custom_reward_function)
mdp.reset()

critic_params = dict(network=Network,
                        optimizer={'class': torch.optim.Adam,
                                'params': {}},
                        loss=torch.nn.functional.mse_loss,
                        n_features=64,
                        batch_size=64,
                        input_shape=(12,),
                        output_shape=(1,), 
                        use_cuda = cuda)

alg_params = dict(actor_optimizer={'class': torch.optim.Adam,
                                    'params': {}},
                    critic_params=critic_params,
                    n_epochs_policy = 1,
                    batch_size = 5,
                    eps_ppo = 0.01,
                    lam = 0.95,
                 )

policy_params = dict(
    std_0=1.,
    n_features=64,
    use_cuda=cuda
)

policy = GaussianTorchPolicy(Network,
                                (12,),
                                (6,),
                                **policy_params)
# policy.load("dataset/policy")

agent = PPOAgent(mdp.info, policy, **alg_params)

# Algorithm
core = ChallengeCore(agent, mdp)

def train(n_epochs, n_episode_learn, n_episode_eval, n_episode_per_fit):
    dataset = core.evaluate(n_episodes=n_episode_eval, render=False)
    metrics = compute_metrics(dataset)
    # logger.epoch_info(0, metrics=metrics)
    # logger.info('Tranining start')
    print("Epoch:",0,"metrics:",metrics)
    print("Training start")
    
    for n in tqdm(range(n_epochs)):
        core.learn(n_episodes=n_episode_learn, n_episodes_per_fit=n_episode_per_fit)
        dataset = core.evaluate(n_episodes=n_episode_eval, render=False)
        metrics = compute_metrics(dataset)
        # logger.epoch_info(n+1, metrics=metrics)
        print("Epoch:",n+1,"metrics:",metrics)
        agent.save(f"dataset/PPO2/epoch_{n+1}.pkl")

    # logger.info('Tranining end')
    print("Training end")


In [6]:
train(n_epochs=10, n_episode_learn=1000, n_episode_eval=10, n_episode_per_fit=5)

                                               

Epoch: 0 metrics: (-4992.14380677302, -267.5048820310003, -1989.3761599160575, -1757.0554467926258, 10)
Training start


 10%|█         | 1/10 [54:52<8:13:52, 3292.49s/it]

Epoch: 1 metrics: (-4751.868966880249, -2335.0301214699375, -3485.3302996413404, -3572.6082714262766, 10)


 20%|██        | 2/10 [1:48:17<7:12:06, 3240.85s/it]

Epoch: 2 metrics: (-3660.4370470741437, 358.6663545132245, -1780.0691811621787, -2122.2385564724887, 10)


 30%|███       | 3/10 [2:45:34<6:28:33, 3330.57s/it]

Epoch: 3 metrics: (-1735.5040761933822, 281.13644188749765, -995.5624805656504, -1047.5318301626112, 10)


 40%|████      | 4/10 [3:42:11<5:35:41, 3356.91s/it]

Epoch: 4 metrics: (-2831.1859387271593, -1116.6242746480095, -2147.188542408853, -2322.363855299334, 10)


 50%|█████     | 5/10 [4:44:46<4:51:41, 3500.38s/it]

Epoch: 5 metrics: (-2528.0538587838405, -210.33383775199354, -1448.4856291174278, -1455.9069276596522, 10)


 60%|██████    | 6/10 [5:41:22<3:50:58, 3464.72s/it]

Epoch: 6 metrics: (-1853.1781504038697, -211.4482764859941, -1276.6910332846758, -1452.3667607820053, 10)


 70%|███████   | 7/10 [6:37:36<2:51:45, 3435.09s/it]

Epoch: 7 metrics: (-2106.673601343533, 99.78590338233307, -1128.1778793773851, -1163.270868535404, 10)


 80%|████████  | 8/10 [7:31:53<1:52:36, 3378.37s/it]

Epoch: 8 metrics: (-1743.8528023399085, 108.41786767452669, -950.538223178788, -874.4067718341444, 10)


 90%|█████████ | 9/10 [8:28:22<56:21, 3381.94s/it]  

Epoch: 9 metrics: (-2486.18798590034, -480.1391371448809, -1617.652262087477, -1744.2412107883458, 10)


100%|██████████| 10/10 [9:26:26<00:00, 3398.61s/it]

Epoch: 10 metrics: (-1675.5107914164626, 668.7345111020212, -771.8362605178797, -803.2625099056299, 10)
Training end





In [7]:
dataset = core.evaluate(n_episodes=5, render=True)

                                             

: 