In [15]:
import numpy as np
import matplotlib.pyplot as plt
import torch

import gym
from gym import wrappers

import torch

import tianshou as ts

import TeachMyAgent as tma



In [3]:
import argparse
import os
import pprint

import gym
import numpy as np
import torch

from torch.utils.tensorboard import SummaryWriter


from tianshou.data import Collector, VectorReplayBuffer
from tianshou.env import SubprocVectorEnv
from tianshou.policy import SACPolicy
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
from tianshou.utils.net.continuous import ActorProb, Critic

In [4]:


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', type=str, default="BipedalWalkerHardcore-v3")
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--buffer-size', type=int, default=1000000)
    parser.add_argument('--actor-lr', type=float, default=3e-4)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--alpha', type=float, default=0.1)
    parser.add_argument('--auto-alpha', type=int, default=1)
    parser.add_argument('--alpha-lr', type=float, default=3e-4)
    parser.add_argument('--epoch', type=int, default=100)
    parser.add_argument('--step-per-epoch', type=int, default=100000)
    parser.add_argument('--step-per-collect', type=int, default=10)
    parser.add_argument('--update-per-step', type=float, default=0.1)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--hidden-sizes', type=int, nargs='*', default=[128, 128])
    parser.add_argument('--training-num', type=int, default=10)
    parser.add_argument('--test-num', type=int, default=100)
    parser.add_argument('--logdir', type=str, default='log')
    parser.add_argument('--render', type=float, default=0.)
    parser.add_argument('--n-step', type=int, default=4)
    parser.add_argument(
        '--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu'
    )
    parser.add_argument('--resume-path', type=str, default=None)
    return parser.parse_args("")



In [5]:
class Wrapper(gym.Wrapper):
    """Env wrapper for reward scale, action repeat and removing done penalty"""

    def __init__(self, env, action_repeat=3, reward_scale=5, rm_done=True):
        super().__init__(env)
        self.action_repeat = action_repeat
        self.reward_scale = reward_scale
        self.rm_done = rm_done

    def step(self, action):
        r = 0.0
        for _ in range(self.action_repeat):
            obs, reward, done, info = self.env.step(action)
            # remove done reward penalty
            if not done or not self.rm_done:
                r = r + reward
            if done:
                break
        # scale reward
        return obs, self.reward_scale * r, done, info

In [6]:
args = get_args()

env = Wrapper(gym.make(args.task))

args.state_shape = env.observation_space.shape or env.observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = env.action_space.high[0]

In [7]:
train_envs = SubprocVectorEnv(
    [lambda: Wrapper(gym.make(args.task)) for _ in range(args.training_num)]
)
# test_envs = gym.make(args.task)
test_envs = SubprocVectorEnv(
    [
        lambda: Wrapper(gym.make(args.task), reward_scale=1, rm_done=False)
        for _ in range(args.test_num)
    ]
)

# seed
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed);

In [8]:
# model
net_a = Net(args.state_shape, hidden_sizes=args.hidden_sizes, device=args.device)
actor = ActorProb(
    net_a,
    args.action_shape,
    max_action=args.max_action,
    device=args.device,
    unbounded=True
).to(args.device)
actor_optim = torch.optim.Adam(actor.parameters(), lr=args.actor_lr)

net_c1 = Net(
    args.state_shape,
    args.action_shape,
    hidden_sizes=args.hidden_sizes,
    concat=True,
    device=args.device
)
critic1 = Critic(net_c1, device=args.device).to(args.device)
critic1_optim = torch.optim.Adam(critic1.parameters(), lr=args.critic_lr)

net_c2 = Net(
    args.state_shape,
    args.action_shape,
    hidden_sizes=args.hidden_sizes,
    concat=True,
    device=args.device
)
critic2 = Critic(net_c2, device=args.device).to(args.device)
critic2_optim = torch.optim.Adam(critic2.parameters(), lr=args.critic_lr)

if args.auto_alpha:
    target_entropy = -np.prod(env.action_space.shape)
    log_alpha = torch.zeros(1, requires_grad=True, device=args.device)
    alpha_optim = torch.optim.Adam([log_alpha], lr=args.alpha_lr)
    args.alpha = (target_entropy, log_alpha, alpha_optim)

policy = SACPolicy(
    actor,
    actor_optim,
    critic1,
    critic1_optim,
    critic2,
    critic2_optim,
    tau=args.tau,
    gamma=args.gamma,
    alpha=args.alpha,
    estimation_step=args.n_step,
    action_space=env.action_space
)


In [13]:
# load a previous policy
if args.resume_path:
    policy.load_state_dict(torch.load(args.resume_path))
    print("Loaded agent from: ", args.resume_path)

# collector
train_collector = Collector(
    policy,
    train_envs,
    VectorReplayBuffer(args.buffer_size, len(train_envs)),
    exploration_noise=True
)
test_collector = Collector(policy, test_envs)
# train_collector.collect(n_step=args.buffer_size)
# log
log_path = os.path.join(args.logdir, args.task, 'sac')
# writer = SummaryWriter(log_path)
# logger = TensorboardLogger(writer)

def save_fn(policy):
    print('saving')
    # torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

def stop_fn(mean_rewards):
    return mean_rewards >= env.spec.reward_threshold

In [14]:
# trainer
result = offpolicy_trainer(
    policy,
    train_collector,
    test_collector,
    args.epoch,
    args.step_per_epoch,
    args.step_per_collect,
    args.test_num,
    args.batch_size,
    update_per_step=args.update_per_step,
    test_in_train=False,
    stop_fn=stop_fn,
    save_fn=save_fn,
    # logger=logger
)


saving


Epoch #1: 100001it [08:11, 203.33it/s, alpha=0.104, env_step=100000, len=0, loss/actor=-40.498, loss/alpha=-0.782, loss/critic1=176.738, loss/critic2=174.742, n/ep=0, n/st=10, rew=0.00]                               


Epoch #1: test_reward: -47.791163 ± 29.421405, best_reward: -21.486006 ± 35.335554 in #0


Epoch #2: 100001it [07:46, 214.48it/s, alpha=0.199, env_step=200000, len=0, loss/actor=-62.109, loss/alpha=0.093, loss/critic1=266.739, loss/critic2=260.335, n/ep=0, n/st=10, rew=0.00]                                 


Epoch #2: test_reward: -59.329818 ± 34.202553, best_reward: -21.486006 ± 35.335554 in #0


Epoch #3: 100001it [06:39, 250.43it/s, alpha=0.227, env_step=300000, len=0, loss/actor=-78.180, loss/alpha=-0.073, loss/critic1=329.360, loss/critic2=334.073, n/ep=0, n/st=10, rew=0.00]                                


saving
Epoch #3: test_reward: 2.199228 ± 92.880750, best_reward: 2.199228 ± 92.880750 in #3


Epoch #4: 100001it [06:26, 258.63it/s, alpha=0.256, env_step=400000, len=0, loss/actor=-97.203, loss/alpha=-0.015, loss/critic1=435.555, loss/critic2=454.688, n/ep=0, n/st=10, rew=0.00]                                


saving
Epoch #4: test_reward: 46.998529 ± 111.960033, best_reward: 46.998529 ± 111.960033 in #4


Epoch #5: 100001it [06:33, 253.83it/s, alpha=0.278, env_step=500000, len=0, loss/actor=-115.976, loss/alpha=-0.069, loss/critic1=525.618, loss/critic2=528.997, n/ep=0, n/st=10, rew=0.00]                                


Epoch #5: test_reward: 33.326299 ± 126.818333, best_reward: 46.998529 ± 111.960033 in #4


Epoch #6: 100001it [06:52, 242.20it/s, alpha=0.300, env_step=600000, len=0, loss/actor=-132.136, loss/alpha=0.056, loss/critic1=584.567, loss/critic2=612.025, n/ep=0, n/st=10, rew=0.00]                                 


saving
Epoch #6: test_reward: 66.028082 ± 125.914366, best_reward: 66.028082 ± 125.914366 in #6


Epoch #7: 100001it [07:21, 226.51it/s, alpha=0.327, env_step=700000, len=0, loss/actor=-144.947, loss/alpha=-0.186, loss/critic1=659.711, loss/critic2=672.923, n/ep=0, n/st=10, rew=0.00]                                


Epoch #7: test_reward: 35.044802 ± 105.156601, best_reward: 66.028082 ± 125.914366 in #6


Epoch #8: 100001it [07:41, 216.63it/s, alpha=0.298, env_step=800000, len=0, loss/actor=-155.390, loss/alpha=-0.189, loss/critic1=678.853, loss/critic2=701.587, n/ep=0, n/st=10, rew=0.00]                                


Epoch #8: test_reward: 57.743687 ± 125.716524, best_reward: 66.028082 ± 125.914366 in #6


Epoch #9: 100001it [06:44, 247.49it/s, alpha=0.294, env_step=900000, len=0, loss/actor=-162.459, loss/alpha=0.035, loss/critic1=693.644, loss/critic2=716.336, n/ep=0, n/st=10, rew=0.00]                                 


saving
Epoch #9: test_reward: 72.107249 ± 134.198899, best_reward: 72.107249 ± 134.198899 in #9


Epoch #10:  12%|#2        | 12050/100000 [00:50<06:06, 239.87it/s, alpha=0.312, env_step=912040, len=0, loss/actor=-165.178, loss/alpha=-0.082, loss/critic1=713.259, loss/critic2=727.375, n/ep=0, n/st=10, rew=0.00]     

KeyboardInterrupt



In [None]:
pprint.pprint(result)
# Let's watch its performance!
policy.eval()
test_envs.seed(args.seed)
test_collector.reset()
result = test_collector.collect(n_episode=args.test_num, render=args.render)
rews, lens = result["rews"], result["lens"]
print(f"Final reward: {rews.mean()}, length: {lens.mean()}")

In [10]:
import matplotlib.pyplot as plt
import gym

env = gym.make('CartPole-v1')

env.gravity = 20
print(env.gravity)
env.print_gravity()

env.set_gravity(40)
print(env.gravity)
env.print_gravity()


20
9.8
20
40


In [4]:
import matplotlib.pyplot as plt
import gym
import numpy as np


for gravity in np.linspace(1, 50, 20):
    env = gym.make('CartPole-v1')
 
    np.random.seed(0)
    env.seed(0)

    obs = env.reset()

    done = False

    eprew = 0
    eplen = 0
    while not done:
        env.set_gravity(gravity)
        
        obs, rew, done, info = env.step(eplen%2)
        eprew += rew
        eplen += 1

    env.close()
    eplen, eprew
    
    print(eprew)
    print(env.state)


56.0
(-0.10434375041651943, 0.03906063338627813, 0.21477228615085472, 0.1514375271789289)
41.0
(-0.08893778710637525, -0.16590066610000895, 0.2140352901972855, 0.6700038126959267)
35.0
(-0.08357150201210116, -0.17310259752823062, 0.21538388606672643, 0.8313309310765957)
31.0
(-0.07971880968863711, -0.1780293823019942, 0.21013077059874896, 0.9404867148296607)
29.0
(-0.07830760800256713, -0.18442663041693377, 0.2190678708087497, 1.0857062251349299)
27.0
(-0.07643739300199816, -0.18865499807711522, 0.21770639374702178, 1.180067122762774)
26.0
(-0.07799230889326551, -0.0006236761621742082, 0.23210123800057633, 1.0438558939224978)
24.0
(-0.07543778581263856, -0.000660368333622291, 0.21537995433144305, 1.0406792730673975)
23.0
(-0.072625177660847, -0.19896599491844436, 0.21338378626713403, 1.409641667687952)
22.0
(-0.07354612031091992, -0.006840778303437101, 0.2135435471586821, 1.178639142477889)
22.0
(-0.07449019971258565, -0.01576179390213972, 0.23471173567526246, 1.384765219844419)
21.0
(

In [36]:
# train_envs = ts.env.DummyVectorEnv([lambda: gym.make('CartPole-v0') for _ in range(10)])
# test_envs = ts.env.DummyVectorEnv([lambda: gym.make('CartPole-v0') for _ in range(100)])

env = gym.make('CartPole-v0')
train_envs = gym.make('CartPole-v0')
test_envs = gym.make('CartPole-v0')
train_envs.gravity = 50
test_envs.gravity = 50

In [13]:
import torch, numpy as np
from torch import nn

class Net(nn.Module):
    def __init__(self, state_shape, action_shape):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(np.prod(state_shape), 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, 128), nn.ReLU(inplace=True),
            nn.Linear(128, np.prod(action_shape)),
        )

    def forward(self, obs, state=None, info={}):
        if not isinstance(obs, torch.Tensor):
            obs = torch.tensor(obs, dtype=torch.float)
        batch = obs.shape[0]
        logits = self.model(obs.view(batch, -1))
        return logits, state

state_shape = env.observation_space.shape or env.observation_space.n
action_shape = env.action_space.shape or env.action_space.n
net = Net(state_shape, action_shape)
optim = torch.optim.Adam(net.parameters(), lr=1e-3)
policy = ts.policy.DQNPolicy(net, optim, discount_factor=0.9, estimation_step=3, target_update_freq=320)

In [39]:
train_collector = ts.data.Collector(policy, train_envs, ts.data.VectorReplayBuffer(20000, 10), exploration_noise=True)
test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

In [40]:
# pre-collect at least 5000 transitions with random action before training
train_collector.collect(n_step=5000, random=True)

policy.set_eps(0.1)
for i in range(int(1e6)):  # total step
    collect_result = train_collector.collect(n_step=10)

    # once if the collected episodes' mean returns reach the threshold,
    # or every 1000 steps, we test it on test_collector
    if collect_result['rews'].mean() >= env.spec.reward_threshold or i % 1000 == 0:
        policy.set_eps(0.05)
        result = test_collector.collect(n_episode=100)
        print('testing collector rewards: ', result['rews'].mean())
        if result['rews'].mean() >= env.spec.reward_threshold:
            print(f'Finished training! Test mean returns: {result["rews"].mean()}')
            break
        else:
            # back to training eps
            policy.set_eps(0.1)

    # train policy with a sampled batch data from buffer
    losses = policy.update(64, train_collector.buffer)

testing collector rewards:  9.45


  # Remove the CWD from sys.path while we load stuff.


testing collector rewards:  199.99
Finished training! Test mean returns: 199.99


In [16]:
espace = np.linspace(0, 50, 10)[:, None]

In [17]:
espace.shape

(10, 1)

In [14]:
train_envs = gym.make('CartPole-v0')
test_envs = gym.make('CartPole-v0')

train_envs.gravity = 50
test_envs.gravity = 50

# train_collector = ts.data.Collector(policy, train_envs, ts.data.VectorReplayBuffer(20000, 10), exploration_noise=True)
# test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)

def random_policy():
    state_shape = env.observation_space.shape or env.observation_space.n
    action_shape = env.action_space.shape or env.action_space.n
    net = Net(state_shape, action_shape)
    optim = torch.optim.Adam(net.parameters(), lr=1e-3)
    policy = ts.policy.DQNPolicy(net, optim, discount_factor=0.9, estimation_step=3, target_update_freq=320)
    return policy


def train_policy(policy, env_param):
    policy.set_eps(0.1)

    return policy


# scores = np.zeros(espace.shape[:-1])
# policies = np.empty(espace.shape[:-1], dtype=object)

# for i in range(1000):
#     if i<G:
#         p_old = random_policy()
#     else:
#         coor_old = random_coordinate()
#         p_old = policies[coor_old]
#     # TODO: maybe add random mutation noise to escape local minima?
    
#     coor_new = random_coordinate()
#     p_new = train_policy(p_old, espace[coor_new])
    
#     score_new = test_policy(p_new)
#     if policies[coor_new] is None or score_new > scores[coor_new]:
#         scores[coor_new] = score_new
#         policies[coor_new] = p_new

In [11]:
def test_policy(policy):
    policy.set_eps(0.05)
    results = []
    for e in espace:
        print('starting ', e)
        # test_envs.gravity = e[0]
        test_envs.set_gravity(e[0])
        test_envs.reset()
        test_collector = ts.data.Collector(policy, test_envs, exploration_noise=True)
        
        result = test_collector.collect(n_episode=500)
        results.append(result['rews'])
        
    results = np.array(results)
    return results

plt.plot(espace[:, 0], test_policy(random_policy()).mean(axis=1))

NameError: name 'espace' is not defined

In [76]:
test_collector.env.gravity

[15.0]