In [1]:
import gym
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch as T
import torch.nn as nn

In [2]:
# Create the walker environment and show the shapes of its observation and
# action spaces (observation first, action second).
env_name = "BipedalWalker-v3"
env = gym.make(env_name)
print(*(space.shape for space in (env.observation_space, env.action_space)))

(24,) (4,)


In [3]:
# Sanity check: play a few episodes with uniformly random actions and print
# the undiscounted return of each one.
TEST_EPISODE_LENGTH = 6  # number of episodes to play (not steps per episode)
for i in range(TEST_EPISODE_LENGTH):
    env.reset()
    episode_reward = 0
    done = False
    # Step until the environment signals termination.
    while not done:
        _, reward, done, _ = env.step(env.action_space.sample())
        episode_reward += reward
    print(f"Episode {i+1} reward: {episode_reward}")


Episode 1 reward: -83.90816557613067
Episode 2 reward: -76.44822417025898
Episode 3 reward: -104.40185153550236
Episode 4 reward: -103.83301595156112
Episode 5 reward: -100.77347654678921
Episode 6 reward: -102.69136026430813


In [3]:
class BCO(nn.Module):
    """Policy network plus inverse-dynamics network for BCO-style imitation.

    (BCO presumably stands for "Behavioural Cloning from Observation" --
    confirm against the project notes.)

    Both networks are identical MLPs (64-128-128-64 hidden units with
    LeakyReLU activations) that differ only in their input width:

    * ``policy_net``  maps an observation to an action.
    * ``inverse_net`` maps a concatenated (observation, next observation)
      pair to the action that is predicted to connect them.

    Args:
        env: any object exposing ``observation_space.shape`` and
            ``action_space.shape``; only the first entry of each shape is read.
    """

    # Hidden layer widths shared by both networks.
    HIDDEN_SIZES = (64, 128, 128, 64)

    def __init__(self, env) -> None:
        super().__init__()
        self.action_dim = env.action_space.shape[0]
        self.observation_dim = env.observation_space.shape[0]
        # Build the policy first, then the inverse model, so parameter
        # creation order (and therefore seeded initialisation) is unchanged.
        self.policy_net = self._build_mlp(self.observation_dim, self.action_dim)
        self.inverse_net = self._build_mlp(self.observation_dim * 2, self.action_dim)

    @classmethod
    def _build_mlp(cls, in_dim, out_dim):
        """Return Linear/LeakyReLU stack ``in_dim -> HIDDEN_SIZES -> out_dim``.

        The layer order (Linear at even indices, LeakyReLU at odd indices, no
        activation after the last Linear) matches the original hand-written
        ``nn.Sequential`` so existing ``state_dict`` checkpoints still load.
        """
        layers = []
        width = in_dim
        for hidden in cls.HIDDEN_SIZES:
            layers.append(nn.Linear(width, hidden))
            layers.append(nn.LeakyReLU())
            width = hidden
        layers.append(nn.Linear(width, out_dim))
        return nn.Sequential(*layers)

    def predict(self, observation):
        """Return the policy's action for ``observation`` (last dim = observation_dim)."""
        return self.policy_net(observation)

    def predict_inverse(self, observation1, observation2):
        """Return the action predicted to lead from ``observation1`` to ``observation2``.

        The two observations are joined along their last (feature) axis, so
        the call works for unbatched 1-D inputs as well as for batched inputs;
        for 2-D ``(batch, observation_dim)`` tensors this is identical to the
        previous ``dim=1`` concatenation.
        """
        joined = T.cat([observation1, observation2], dim=-1)
        return self.inverse_net(joined)

# Instantiate the model, then move its parameters to the GPU. The training
# cells also call .cuda() on every batch, so a CUDA device is required.
model = BCO(env)
model = model.cuda()


In [10]:
from torch.utils.data import Dataset, DataLoader

class Dataset_Inverse(Dataset):
    """Flat, indexable view over the steps of several trajectories.

    ``trajs`` is an iterable of trajectories, each an iterable of samples.
    In this notebook every sample is ``[obs, new_obs, act]``. The samples
    are stored in trajectory order and returned unchanged.
    """

    def __init__(self, trajs) -> None:
        super().__init__()
        # Flatten: one entry per step, across all trajectories.
        self.data = [sample for traj in trajs for sample in traj]

    def __len__(self):
        """Total number of steps across all trajectories."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the stored sample at position ``index``."""
        return self.data[index]
    
class Dataset_Policy(Dataset):
    """Indexable view over a single flat sequence of samples.

    In this notebook every sample is ``[obs, act]``; the samples are copied
    into a list in iteration order and returned unchanged.
    """

    def __init__(self, traj) -> None:
        super().__init__()
        # Materialise the iterable so it supports len() and random access.
        self.data = list(traj)

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the stored sample at position ``index``."""
        return self.data[index]


# train an expert

In [4]:
from stable_baselines3 import SAC
from stable_baselines3.sac import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy



In [None]:
# SAC agent with the MLP policy on the walker environment; every other
# hyper-parameter is left at the library default. verbose=1 turns on the
# training log tables.
expert = SAC(policy=MlpPolicy, env=env, verbose=1)

In [8]:
# Reference: https://www.gymlibrary.dev/environments/box2d/bipedal_walker/
# A return above 300 counts as good performance on this task.
# Train for 500k environment steps, logging every 10 episodes.
expert.learn(total_timesteps=500_000, log_interval=10)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 960      |
|    ep_rew_mean     | 303      |
| time/              |          |
|    episodes        | 10       |
|    fps             | 188      |
|    time_elapsed    | 50       |
|    total_timesteps | 9596     |
| train/             |          |
|    actor_loss      | -18.3    |
|    critic_loss     | 0.126    |
|    ent_coef        | 0.00915  |
|    ent_coef_loss   | 0.225    |
|    learning_rate   | 0.0003   |
|    n_updates       | 409195   |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 954      |
|    ep_rew_mean     | 304      |
| time/              |          |
|    episodes        | 20       |
|    fps             | 187      |
|    time_elapsed    | 101      |
|    total_timesteps | 19083    |
| train/             |          |
|    actor_loss      | -18.1    |
|    critic_loss     | 0.0903   |
|    ent_coef 

<stable_baselines3.sac.sac.SAC at 0x7f7926d50070>

In [10]:
# Evaluate the expert over 10 episodes without rendering; only the first
# returned value is kept and printed.
reward, _ = evaluate_policy(
    expert,
    env,
    n_eval_episodes=10,
    render=False,
    warn=False,
)
print(reward)

306.72787686274387


In [29]:
# Persist the trained expert to disk; the load cell below reads this same file.
expert.save('sac-expert-pedelwalker.zip')

In [6]:
# Restore the expert from the checkpoint written by expert.save() above
# (same file name, relative to the working directory), rebinding `expert`.
expert = SAC.load('./sac-expert-pedelwalker.zip')

# Collect the expert's trajectories

In [12]:
# Roll out the expert until 800 trajectories of exactly EPISODE_LENGTH steps
# have been gathered. Episodes that terminate before reaching that length are
# thrown away; longer episodes are cut off at EPISODE_LENGTH steps.
EPISODE_LENGTH = 500
trajs = []
while len(trajs) < 800:
    traj = []
    obs = env.reset()
    done = False
    while not done and len(traj) < EPISODE_LENGTH:
        action, _ = expert.predict(obs)
        new_obs, _, done, _ = env.step(action)
        # One step of the trajectory: state, next state, expert action.
        traj.append([obs, new_obs, action])
        obs = new_obs
    # Keep only trajectories that reached the full length.
    if len(traj) >= EPISODE_LENGTH:
        trajs.append(traj)
    

In [34]:
# Save the expert trajectories to disk.
#
# The previous version first serialised `trajs` with pickle.dumps() and then
# pickled the resulting bytes a second time, so a single pickle.load() gave
# back a bytes blob instead of the trajectory list. The list is now written
# directly and can be restored with:
#
#     with open('exprt_trajs.pkl', 'rb') as file:
#         trajs = pickle.load(file)
#
# NOTE: a file written by the old cell needs pickle.loads(pickle.load(file)).
# NOTE: only unpickle files you created yourself; pickle can execute code.
import pickle
with open('exprt_trajs.pkl', 'wb') as file:
    pickle.dump(trajs, file)

# Demonstration dataset: expert transitions wrapped in the inverse-model dataset format

In [14]:
# Mini-batch size; the training cell reuses it for its own DataLoaders.
BATCH_SIZE = 100
# Expert transitions, flattened across trajectories and served in recording
# order (no shuffling is requested here).
ld_demo = DataLoader(Dataset_Inverse(trajs), batch_size=BATCH_SIZE)
# Number of batches in the demonstration loader.
print(len(ld_demo))
# Peek at the first batch only, to check the tensor shapes of the two
# observation fields; the third field (the action) is ignored here.
for obs1, obs2, _ in ld_demo:
    print(obs1.shape, obs2.shape)
    break

4000
torch.Size([100, 24]) torch.Size([100, 24])


In [15]:
# Mean-squared-error loss; the training cell uses it for both the
# inverse-model update and the policy update.
loss_func = nn.MSELoss().cuda()
# One Adam optimizer (library-default settings) over every parameter of
# `model`, so the policy net and the inverse net share the same optimizer.
optim = T.optim.Adam(model.parameters())

# Number of outer training iterations.
EPOCHES = 1000
# Rollouts in one iteration stop after the first episode that brings the
# step counter to at least M.
M = 5000

# Exploration threshold: a rollout step uses the policy's action when
# np.random.rand() >= EPS and a random action otherwise.
EPS = 0.99
# EPS is multiplied by DECAY at the end of every training iteration.
DECAY = 0.9

In [21]:
from collections import deque

In [None]:

import os

# Rolling buffer of the learner's own trajectories (each a list of
# [obs, new_obs, act] steps) used to fit the inverse model. It persists
# across epochs; once 5000 trajectories are stored the oldest are dropped.
trajs_inv = deque(maxlen=5000)

# Create the checkpoint directory up front so T.save() at the end of an epoch
# cannot fail because the directory is missing.
os.makedirs('Model', exist_ok=True)


def _fit_one_pass(loader, forward, tag, epoch):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    Every batch is a sequence whose last element is the target action and
    whose preceding elements are the inputs handed to `forward`. `tag` names
    the loss in the progress bar and in the printed summary line.
    """
    total = 0.0
    with tqdm(loader) as bar:
        for *inputs, target in bar:
            pred = forward(*(x.float().cuda() for x in inputs))
            loss = loss_func(pred, target.float().cuda())

            # `optim` holds the parameters of BOTH networks. Clearing grads to
            # None (rather than to zero tensors) makes Adam skip the network
            # that takes no part in this loss, so stale momentum cannot move it.
            optim.zero_grad(set_to_none=True)
            loss.backward()
            optim.step()

            loss_val = loss.item()
            bar.set_postfix(**{tag: '%.3f' % loss_val})
            total += loss_val

    mean_loss = total / len(loader)
    print('Ep %d: %s=%.3f' % (epoch, tag, mean_loss))
    return mean_loss


for e in tqdm(range(EPOCHES)):

    # step1, generate inverse samples: roll out whole episodes with an
    # epsilon-random policy until at least M steps were taken this epoch
    cnt = 0   # environment steps taken this epoch
    epn = 0   # episodes finished this epoch
    rews = 0  # sum of episode returns this epoch

    while True:
        traj = []
        rew = 0

        obs = env.reset()
        done = False
        while not done:
            if np.random.rand() >= EPS:
                # Greedy step: query the policy; no autograd graph is needed.
                with T.no_grad():
                    inp = T.from_numpy(obs).float().unsqueeze(0).cuda()
                    act = model.predict(inp)[0].cpu().numpy()
            else:
                act = env.action_space.sample()

            new_obs, r, done, _ = env.step(act)

            traj.append([obs, new_obs, act])
            obs = new_obs
            rew += r
            cnt += 1

        rews += rew
        trajs_inv.append(traj)
        epn += 1

        # Checked only after a complete episode, so at least one episode is
        # always collected and episodes are never cut short.
        if cnt >= M:
            break

    rews /= epn
    print('Ep %d: reward=%.2f' % (e+1, rews))

    # step2, update inverse model on the learner's own transitions
    ld_inv = DataLoader(Dataset_Inverse(trajs_inv), batch_size=BATCH_SIZE, shuffle=True)
    _fit_one_pass(ld_inv, model.predict_inverse, 'loss_inv', e + 1)

    # step3, predict inverse action for demo samples
    traj_policy = []
    with T.no_grad():
        for obs1, obs2, _ in ld_demo:
            inferred = model.predict_inverse(obs1.float().cuda(), obs2.float().cuda())
            # Pair rows by the actual batch length so a final batch shorter
            # than BATCH_SIZE does not raise an IndexError.
            traj_policy.extend(
                [o, a] for o, a in zip(obs1.numpy(), inferred.cpu().numpy())
            )

    # step4, update policy via demo samples labelled with inferred actions
    ld_policy = DataLoader(Dataset_Policy(traj_policy), batch_size=BATCH_SIZE, shuffle=True)
    _fit_one_pass(ld_policy, model.predict, 'loss_policy', e + 1)

    # step5, save model
    T.save(model.state_dict(), 'Model/model_reacher_%d.pt' % (e+1))

    # Reduce the share of random actions for the next epoch. This mutates the
    # global EPS, so re-running the cell continues from the decayed value.
    EPS *= DECAY