In [1]:
from air_hockey_challenge.framework.air_hockey_challenge_wrapper import AirHockeyChallengeWrapper
from air_hockey_challenge.framework.challenge_core import ChallengeCore
from air_hockey_challenge.framework.agent_base import AgentBase
from examples.control.hitting_agent import build_agent, HittingAgent
from examples.control.hitting_agent_wait import HittingAgentWait

from mushroom_rl.utils.dataset import parse_dataset, select_random_samples
from mushroom_rl.policy import GaussianTorchPolicy

import torch
import torch.nn as nn

from tqdm import tqdm

import pickle

import numpy as np

# Pick the torch device string once, based on whether CUDA is available.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = 'cuda'
else:
    device = 'cpu'
print(f"Cuda: {use_cuda}")


Cuda: True


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

class DaggerDataset(Dataset):
    """Growable in-memory dataset of (state, expert action) pairs for DAgger.

    ``states`` and ``expert_actions`` are torch tensors kept on the
    notebook-level ``device``; row ``i`` of one corresponds to row ``i`` of
    the other.
    """

    def __init__(self, states, expert_actions, action_dim=6):
        """Build the dataset from numpy arrays.

        Args:
            states: numpy array of states, one per row.
            expert_actions: numpy array of expert actions; it is flattened to
                rows of ``action_dim`` values.
            action_dim: length of one flattened action. Defaults to 6, the
                value that used to be hard-coded.
        """
        self.states = torch.from_numpy(states).to(device)
        self.expert_actions = torch.from_numpy(expert_actions).to(device).reshape(-1, action_dim)

    def add(self, state, expert_action):
        """Append one or more (state, expert action) samples.

        Args:
            state: numpy array holding one or more states.
            expert_action: numpy array holding the matching expert action(s).
        """
        # Row widths come from the stored tensors instead of the literals
        # 12 and 6, so the dataset also works for other state/action sizes.
        state_rows = torch.from_numpy(state).to(device).reshape(-1, self.states.shape[-1])
        action_rows = torch.from_numpy(expert_action).to(device).reshape(-1, self.expert_actions.shape[-1])
        self.states = torch.cat((self.states, state_rows), dim=0)
        self.expert_actions = torch.cat((self.expert_actions, action_rows), dim=0)

    def save(self, path):
        """Pickle ``(states, expert_actions)`` to ``path`` as CPU tensors."""
        with open(path, 'wb') as f:
            pickle.dump((self.states.to("cpu"), self.expert_actions.to("cpu")), f)

    def load(self, path):
        """Replace the stored tensors with the pair written by ``save``.

        NOTE: unpickling can execute arbitrary code, so only load files that
        were produced by ``save`` from a trusted source.
        """
        # The file is written with pickle.dump, so read it with pickle.load
        # rather than relying on np.load's allow_pickle fallback.
        with open(path, 'rb') as f:
            states, expert_actions = pickle.load(f)
        self.states = states.to(device)
        self.expert_actions = expert_actions.to(device)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.states)

    def __getitem__(self, index):
        """Return the ``(state, expert_action)`` pair at ``index``."""
        return self.states[index], self.expert_actions[index]


In [3]:
class Network(nn.Module):
    """Fully connected network with two tanh hidden layers and a linear head."""

    def __init__(self, input_shape, output_shape, n_features, **kwargs):
        """Create the three linear layers and apply Xavier-uniform init.

        Args:
            input_shape: tuple whose first entry is the input size.
            output_shape: tuple whose first entry is the output size.
            n_features: width of both hidden layers.
            **kwargs: accepted and ignored.
        """
        super().__init__()

        self._h1 = nn.Linear(input_shape[0], n_features)
        self._h2 = nn.Linear(n_features, n_features)
        self._h3 = nn.Linear(n_features, output_shape[0])

        # Hidden layers use the tanh gain, the output layer the linear gain.
        layer_gains = (
            (self._h1, 'tanh'),
            (self._h2, 'tanh'),
            (self._h3, 'linear'),
        )
        for layer, nonlinearity in layer_gains:
            nn.init.xavier_uniform_(layer.weight,
                                    gain=nn.init.calculate_gain(nonlinearity))

    def forward(self, obs, **kwargs):
        """Map observations to the network output.

        A size-1 dimension at index 1 of ``obs`` is squeezed away and the
        input is cast to float32 before the first layer.
        """
        net_input = torch.squeeze(obs, 1).float()
        hidden = torch.tanh(self._h1(net_input))
        hidden = torch.tanh(self._h2(hidden))
        return self._h3(hidden)


class BCAgent(AgentBase):
    """Challenge agent that delegates action selection to a learned policy."""

    def __init__(self, env_info, policy, **kwargs):
        """
        Args:
            env_info: environment information forwarded to ``AgentBase``.
            policy: object exposing ``draw_action(observation)`` that returns
                a flat action array.
            **kwargs: forwarded unchanged to ``AgentBase``.
        """
        super().__init__(env_info, **kwargs)
        self.policy = policy

    def reset(self):
        """Nothing to reset: the agent keeps no per-episode state of its own."""
        pass

    def draw_action(self, observation):
        """Return the policy's flat action reshaped to two rows.

        The second dimension is inferred, so a 6-value action still becomes
        (2, 3) as before while other even-length actions are handled too.
        NOTE(review): the two rows are presumably joint positions and joint
        velocities - confirm against the challenge's action format.
        """
        return self.policy.draw_action(observation).reshape(2, -1)


In [4]:
from air_hockey_challenge.framework.evaluate_agent import PENALTY_POINTS
from air_hockey_challenge.utils.kinematics import forward_kinematics
import numpy as np


def custom_reward_function(base_env, state, action, next_state, absorbing):
    """Shaped reward: step cost, constraint-violation penalty and a puck term.

    Args:
        base_env: environment whose ``env_info`` provides the observation
            index lists, the constraint set and the robot model/data.
        state: observation vector the reward is computed from.
        action: unused.
        next_state: unused.
        absorbing: unused.

    Returns:
        The scalar reward for this step.
    """
    reward = -0.1  # small per-step cost to discourage long runs: faster is better

    q = state[base_env.env_info['joint_pos_ids']]
    dq = state[base_env.env_info['joint_vel_ids']]

    # Get a dictionary of the constraint functions {"constraint_name": ndarray}
    c = base_env.env_info['constraints'].fun(q, dq)

    # Penalise by the amount of violation (positive entries) rather than by a
    # flat number of penalty points, so the signal says how far outside the
    # limits the robot is.
    for violations in c.values():
        violations = np.asarray(violations)
        reward -= violations[violations > 0].sum()

    # TODO reward for good actions --> score
    # for now --> try to go to the puck

    robot = base_env.env_info['robot']
    ee_pos = forward_kinematics(robot['robot_model'], robot['robot_data'], q)[0]

    # The puck entries of the observation are (x, y, yaw) and (dx, dy, dyaw)
    # per the Air Hockey Challenge observation layout, so only the first two
    # components are planar position / linear velocity. Using all three mixed
    # the puck yaw with the end-effector height in the distance and counted
    # spin as speed.
    # NOTE(review): assumes ee_pos and the puck position are expressed in the
    # same reference frame - confirm.
    puck_xy = state[base_env.env_info['puck_pos_ids']][:2]
    puck_speed = np.linalg.norm(state[base_env.env_info['puck_vel_ids']][:2])

    if puck_speed < 0.1:
        # Puck (almost) at rest: pull the end-effector towards it.
        reward -= np.linalg.norm(ee_pos[:2] - puck_xy)
    else:
        # Puck moving: reward its speed.
        reward += puck_speed

    return reward

In [7]:
from mushroom_rl.utils.dataset import compute_metrics


def train_dagger_agent(learner_policy, expert_policy, mdp, dataset, num_iterations=10, batch_size=100,
                       lr=0.001, n_steps=100, num_sub_epochs=50, n_episode_eval=5,
                       n_grad_passes=10, save_dir="dataset/Dagger"):
    """Train ``learner_policy`` with DAgger against ``expert_policy``.

    Each sub-epoch fits the learner to ``dataset`` by maximising the
    log-likelihood of the expert actions, rolls the learner out in ``mdp``,
    asks the expert for an action in every visited state and appends the
    pairs for which the expert's optimisation did not fail. After each outer
    iteration the learner is evaluated and both policy and dataset are saved.

    Args:
        learner_policy: policy exposing ``parameters()``, ``log_prob_t`` and
            ``save``; it is updated in place.
        expert_policy: expert exposing ``reset()``, ``draw_action(state)`` and
            an ``optimization_failed`` flag.
        mdp: environment used for roll-outs and evaluation.
        dataset: ``DaggerDataset`` that is read for training and grown in place.
        num_iterations: number of outer iterations (evaluate + checkpoint).
        batch_size: mini-batch size of the data loader.
        lr: Adam learning rate.
        n_steps: environment steps per learner roll-out.
        num_sub_epochs: fit / roll-out / relabel rounds per outer iteration.
        n_episode_eval: episodes used for the evaluation after each iteration.
        n_grad_passes: full passes over the dataset per sub-epoch
            (previously the hard-coded value 10).
        save_dir: directory for the per-iteration policy and dataset files
            (previously hard-coded to "dataset/Dagger").

    Returns:
        The trained ``learner_policy`` (the same object that was passed in).
    """
    import os  # local import so the function does not depend on another cell

    # Create the output directory up front: otherwise the first save, which
    # only happens after a complete training iteration, fails if it is missing.
    os.makedirs(save_dir, exist_ok=True)

    # Create data loader
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Set up the optimizer
    optimizer = optim.Adam(learner_policy.parameters(), lr=lr)

    for epoch in tqdm(range(num_iterations)):
        for _sub_epoch in range(num_sub_epochs):
            # Fit the learner to everything collected so far.
            for _ in range(n_grad_passes):
                for states, expert_actions in data_loader:

                    optimizer.zero_grad()

                    states = states.to(device)
                    expert_actions = expert_actions.to(device)

                    # Negative log-likelihood of the expert actions.
                    loss = -learner_policy.log_prob_t(states, expert_actions).mean()

                    # Update the learner policy
                    loss.backward()
                    optimizer.step()

            # Roll out the current learner to find the states it actually visits.
            learner_core = ChallengeCore(BCAgent(mdp.env_info, learner_policy), mdp)
            learner_trajectory = learner_core.evaluate(n_steps=n_steps)
            visited_states = parse_dataset(learner_trajectory)[0]

            # Relabel every visited state with the expert's action.
            for state in visited_states:
                # The expert is reset before each query, as in the original loop.
                expert_policy.reset()
                expert_action = expert_policy.draw_action(state).astype(np.float32)
                # Skip samples for which the expert reports a failed optimisation.
                if not expert_policy.optimization_failed:
                    dataset.add(state, expert_action.flatten())

        # Evaluate and checkpoint once per outer iteration.
        core = ChallengeCore(BCAgent(mdp.env_info, learner_policy), mdp)
        eval_dataset = core.evaluate(n_episodes=n_episode_eval, render=False)
        metrics = compute_metrics(eval_dataset)
        print("Epoch:",epoch+1,"metrics:",metrics)
        learner_policy.save(f"{save_dir}/policy_{epoch+1}.pkl")
        dataset.save(f"{save_dir}/dataset_{epoch+1}.pkl")

    return learner_policy


# 1. Defining the BCAgent learner and the expert agent used for DAgger training

In [8]:
# Environment id handed to the challenge wrapper.
env = "3dof-hit"

# Wrap the environment with the custom shaped reward defined above and reset it once.
mdp = AirHockeyChallengeWrapper(env, custom_reward_function=custom_reward_function)
mdp.reset()

# The policy can only output 1D actions of shape (6,); BCAgent.draw_action reshapes them to (2, 3).
# Input shape is (12,), hidden width is 64, initial std is 1.
policy = GaussianTorchPolicy(Network, (12,), (6,), std_0=1., n_features=64, use_cuda=use_cuda)

# Optional warm start from a previously saved policy (disabled):
# policy = policy.load('dataset/hit_500_policy')

# Agent wrapper around the untrained policy; in this notebook it is only
# referenced by the commented-out line below.
bc_agent = BCAgent(mdp.env_info, policy)
# dataset = MushroomRLTrajectoryDataset(mdp, bc_agent, n_episodes=10)

# Load the stored trajectories and split them into their components; only
# `states` and `actions` are used to seed the DAgger dataset.
# NOTE(review): presumably these are expert demonstrations - confirm the provenance of hit_500.pkl.
states, actions, reward, next_state, absorbing, last = parse_dataset(np.load(f"dataset/hit_500.pkl", allow_pickle=True))

# Initial DAgger dataset built from the loaded (state, action) pairs.
dataset = DaggerDataset(states, actions)

# Expert queried for labels during DAgger training.
expert_agent = HittingAgentWait(mdp.env_info)

In [None]:
# Run DAgger training for 20 outer iterations; the function returns the trained policy.
# Other settings tried during development:
#   num_iterations=1, batch_size=64, lr=0.001
#   num_sub_epochs=2
dagger_agent = train_dagger_agent(
    policy,
    expert_agent,
    mdp,
    dataset,
    num_iterations=20,
)

In [11]:
from air_hockey_challenge.framework.evaluate_agent import evaluate
from mushroom_rl.policy import GaussianTorchPolicy


def build_agent(env_info, **kwargs):
    """
    Build the agent that the evaluation framework runs.

    NOTE: this definition replaces the ``build_agent`` imported from
    ``examples.control.hitting_agent`` at the top of the notebook.

    :param env_info: The environment information of the environment being evaluated
    :param kwargs: Accepted for compatibility with the framework; unused
    :return: A ``BCAgent`` wrapping the DAgger-trained policy ``dagger_agent``
    :raises ValueError: If the environment is not a hitting environment
    """
    if "hit" in env_info["env_name"]:
        # Use the env_info passed in by the evaluator, not the notebook-level
        # `mdp`, so the agent matches the environment it is evaluated on.
        return BCAgent(env_info, dagger_agent)

    # Previously this fell through and returned None, which only fails later
    # with an unrelated-looking error inside the evaluation loop.
    raise ValueError(
        f"build_agent only supports hitting environments, got {env_info['env_name']!r}"
    )

# Evaluate the DAgger agent for 50 episodes without rendering.
# The result is bound to `eval_results` so the builtin `eval` is not shadowed
# for the rest of the kernel session.
eval_results = evaluate(build_agent, 'logs/dagger_agent', env_list=["3dof-hit"], n_episodes=50, render=False, quiet=False, n_cores=1)

                                               

Environment:        3dof-hit
Number of Episodes: 50
Success:            0.0000
Penalty:            348
Number of Violations: 
  joint_pos_constr  49
  joint_vel_constr  50
  ee_constr         50
  Jerk              50
  Total             199
-------------------------------------------------



In [12]:
# Same evaluation with rendering enabled, on 5 episodes, for visual inspection.
# The result is bound to `eval_results` so the builtin `eval` is not shadowed.
eval_results = evaluate(build_agent, 'logs/dagger_agent', env_list=["3dof-hit"], n_episodes=5, render=True, quiet=False, n_cores=1)

                                             

Environment:        3dof-hit
Number of Episodes: 5
Success:            0.0000
Penalty:            35
Number of Violations: 
  joint_pos_constr  5
  joint_vel_constr  5
  ee_constr         5
  Jerk              5
  Total             20
-------------------------------------------------





: 

In [10]:
# Size of the DAgger dataset as it currently stands in the kernel.
print(len(dataset))


# Reload the original file for comparison. This rebinds the notebook-level
# `states`, `actions`, `reward`, `next_state`, `absorbing` and `last`.
states, actions, reward, next_state, absorbing, last = parse_dataset(np.load(f"dataset/hit_500.pkl", allow_pickle=True))

# Size of the initial data; any difference from the number above presumably
# comes from samples appended via dataset.add during training.
print(len(states))


127416
50823


In [13]:
# Persist the data through DaggerDataset.save instead of pickling the object
# itself: save() writes a plain (states, expert_actions) pair of CPU tensors
# that DaggerDataset.load can read back, and the file does not depend on the
# notebook-defined class being importable when it is loaded.
# NOTE(review): the CUDA error recorded in this cell's output may also come
# from an already-broken CUDA context - confirm on a fresh kernel.
dataset.save("dataset/dagger")

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [21]:
# Roll out the expert agent itself in the same environment for 5 episodes,
# without rendering, and keep the collected data for inspection.
core = ChallengeCore(expert_agent, mdp)
eval_dataset = core.evaluate(n_episodes=5, render=False)



In [22]:
# Number of entries collected in the expert roll-out (presumably one per environment step).
print(len(eval_dataset))

595


# Testing the expert agent

In [None]:
# Query the expert on stored states and print whether its optimisation succeeded.
data_loader = DataLoader(dataset, batch_size=64, shuffle=True)

# DaggerDataset.__getitem__ returns a (state, expert_action) tuple, so each
# batch is a pair of tensors; indexing it with 'state' raised a TypeError.
for batch_states, _ in data_loader:
    batch_states = batch_states.float()
    for state in batch_states:
        # Reset before every query so each state is evaluated independently.
        expert_agent.reset()
        # The expert works on numpy arrays, so move the state off the device first.
        state_np = state.cpu().numpy()
        expert_action = expert_agent.draw_action(state_np)
        print(f"expert_action: {expert_action}")
        print(f"optimization_failed: {expert_agent.optimization_failed}")