In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from envs.ctcartpole import CTCartpole
from envs.ctpendulum import CTPendulum
from envs.ctacrobot import CTAcrobot
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torchdiffeq import odeint
from basic_mdl import basic_mdl
from utils import *
import os
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the continuous-time Acrobot environment used by the rest of the notebook.
# Fall back to the CPU when CUDA is unavailable so the notebook still runs on
# machines without a GPU (assumes CTAcrobot accepts 'cpu' as a device — TODO confirm).
env = CTAcrobot(dt=0.1, device='cuda' if torch.cuda.is_available() else 'cpu', obs_trans=False)


Running fully actuated Acrobot
[-0.0025, 1.0]


In [3]:
# Shared tanh module used to squash raw network outputs.
tanh_ = nn.Tanh()


def final_activation(env, a):
    """Squash a raw network output and scale it by the environment's action range.

    Args:
        env: object exposing ``act_rng``, the factor the squashed output is multiplied by.
        a: raw (unbounded) network output.

    Returns:
        ``tanh(a) * env.act_rng``.
    """
    squashed = tanh_(a)
    scaled = squashed * env.act_rng
    return scaled

class Policy(nn.Module):
    """Deterministic state-feedback policy built on a ``basic_mdl`` network.

    The network is constructed from ``env.n`` and ``env.m`` and its raw output
    is passed through ``final_activation`` before being returned.

    Args:
        env: environment object; ``n``, ``m`` and ``device`` are read from it here.
        nl: number of hidden layers handed to ``basic_mdl``.
        nn: hidden width handed to ``basic_mdl`` (note: this argument name
            shadows the ``torch.nn`` alias inside ``__init__``).
        act: activation name handed to ``basic_mdl``.
    """

    def __init__(self, env, nl=2, nn=200, act='relu'):
        super().__init__()
        hidden_width = nn
        self.env = env
        self.act = act
        self._g = basic_mdl(
            env.n,
            env.m,
            n_hid_layers=nl,
            act=act,
            n_hidden=hidden_width,
            dropout=0.0,
        )
        self.reset_parameters()

    def reset_parameters(self, w=1.0):
        """Re-initialise the underlying network, forwarding ``w`` to it."""
        self._g.reset_parameters(w)

    def forward(self, s, t):
        """Return the action for state ``s``; ``t`` is accepted but not used."""
        # Move the state to the environment's device before evaluating the network.
        raw_action = self._g(s.to(self.env.device))
        return final_activation(self.env, raw_action)

In [None]:
import torch
import torch.nn as nn
import torch.distributions as distributions

class StochasticPolicy(nn.Module):
    """Gaussian policy whose samples are squashed by ``final_activation``.

    A ``basic_mdl`` base network feeds two linear heads that produce the mean
    and the log standard deviation of a diagonal Normal distribution.

    Args:
        env: environment object; ``n``, ``m`` and ``device`` are read from it here.
        nl: number of hidden layers handed to ``basic_mdl``.
        nn: hidden width handed to ``basic_mdl``.
        act: activation name handed to ``basic_mdl``.
    """

    def __init__(self, env, nl=2, nn=200, act='relu'):
        super(StochasticPolicy, self).__init__()
        self.env = env

        # The `nn` argument shadows the `torch.nn` alias inside this method, so
        # keep it under a clearer name and build layers through `torch.nn`.
        n_hidden = nn

        # Use a plain int for the action dimension. The shared env is NOT
        # modified: other models built from the same env read `env.m` too.
        n_act = int(env.m)

        # Base network (same constructor call as Policy._g).
        self._g = basic_mdl(env.n, n_act, n_hid_layers=nl, act=act, n_hidden=n_hidden, dropout=0.0)

        # The base network is constructed with `n_act` outputs; use that width
        # unless the model exposes an explicit `out_features` attribute.
        feat_dim = getattr(self._g, 'out_features', n_act)

        # Separate heads for the mean and the log standard deviation.
        self._mean_layer = torch.nn.Linear(feat_dim, n_act)
        self._log_std_layer = torch.nn.Linear(feat_dim, n_act)

        self.reset_parameters()

    def reset_parameters(self, w=0.1):
        """Re-initialise the base network (forwarding ``w``) and the two heads."""
        self._g.reset_parameters(w)
        torch.nn.init.xavier_uniform_(self._mean_layer.weight)
        torch.nn.init.constant_(self._mean_layer.bias, 0)
        torch.nn.init.constant_(self._log_std_layer.bias, -0.5)  # start with a small std

    def _distribution(self, s):
        """Build the Normal distribution over raw (pre-squash) actions for ``s``."""
        s = s.to(self.env.device)  # ensure s is on the environment's device
        x = self._g(s)
        mean = self._mean_layer(x)
        log_std = self._log_std_layer(x)
        return distributions.Normal(mean, torch.exp(log_std))

    def forward(self, s, t):
        """Sample an action for ``s``; ``t`` is accepted but not used.

        Returns:
            ``(action, dist)`` where ``action`` is a reparameterised sample
            passed through ``final_activation`` and ``dist`` is the Normal
            distribution the raw sample was drawn from.
        """
        dist = self._distribution(s)
        a = dist.rsample()  # reparameterisation trick keeps the sample differentiable
        action = final_activation(self.env, a)
        return action, dist

    def get_action(self, s, t):
        """Return only the sampled, squashed action."""
        action, _ = self.forward(s, t)
        return action

    def get_log_prob(self, s, t, action):
        """Log-density of ``action`` under the policy's Normal, summed over the last axis.

        No sample is drawn here; only the distribution parameters are evaluated.
        NOTE(review): ``action`` is scored directly under the unsquashed
        Gaussian. If callers pass the squashed output of ``forward``, no tanh
        change-of-variables correction is applied — confirm this is intended.
        """
        dist = self._distribution(s)
        return dist.log_prob(action).sum(-1)
    
# Instantiate the stochastic policy for the environment created earlier.
# NOTE(review): later cells rebind `policy_nn` to a deterministic Policy(env).
policy_nn = StochasticPolicy(env)

Cartpole Swingup = FALSE

In [None]:
def discount_rewards(r, gamma):
    """Compute discounted reward-to-go along the last axis of ``r``.

    For every index ``t`` on the last axis the result holds
    ``r[..., t] + gamma * r[..., t+1] + gamma**2 * r[..., t+2] + ...``.

    Args:
        r: reward tensor with at least one dimension; time is the last axis.
        gamma: discount factor.

    Returns:
        Tensor with the same shape and dtype as ``r``.
    """
    discounted_r = torch.zeros_like(r)
    running_add = 0
    # Walk backwards in time. Index the last axis explicitly so that batched
    # inputs (..., T) work and agree with the loop bound r.size(-1).
    for t in reversed(range(r.size(-1))):
        running_add = running_add * gamma + r[..., t]
        discounted_r[..., t] = running_add
    return discounted_r

# Train the deterministic policy and the value network; checkpoints are written
# to models/policy6 and models/value6.

# Set seed for reproducibility (best effort: the environment may keep its own
# random state — TODO confirm).
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
device = env.device

# Initialize policy network
policy_nn = Policy(env)
policy_nn.to(device)

# Initialize value network
us_V = basic_mdl(env.n, 1, n_hid_layers=2, act="tanh", n_hidden=200)
us_V.reset_parameters()
us_V.to(device)

# Initialize optimizers
policy_opt = optim.Adam(policy_nn.parameters(), lr=0.001)
us_V_opt = optim.Adam(us_V.parameters(), lr=0.001)

# Training parameters
num_episodes = 250
num_steps = 20
num_rounds = 50
gamma = 0.99
tau = 2.0
n_value_iters = 5  # value-regression steps per policy update

flag = True

# Checkpoint directories
policy_dir = 'models/policy6'
value_dir = 'models/value6'

for rounds in range(num_rounds):

    rewards, opt_objs = [], []
    for episode in range(num_episodes):
        # Refresh the frozen target copy of the value network every 50 episodes.
        if episode % 50 == 0:
            Vtarget = copy.deepcopy(us_V)

        initial_observations = [env.reset() for _ in range(50)]
        s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
        # NOTE(review): this time grid is overwritten by integrate_system's return below.
        ts = env.build_time_grid(num_steps).to(device)
        policy_opt.zero_grad()

        st, at, rt, ts = env.integrate_system(T=num_steps, g=policy_nn, s0=s0, N=1)
        rew_int = rt[:,-1].mean(0)  # N

        st = torch.cat([st]*5) if st.shape[0]==1 else st
        ts = ts[0]
        gammas = (-ts/tau).exp() # H
        V_st_gam = us_V(st.contiguous())[:,1:,0] * gammas[1:] # L,N,H-1
        # Weight of the value bootstrap: grows linearly from 0 to 1 over the first 5 rounds.
        V_const = min(rounds/5.0,1)
        n_step_returns = rt[:,:,1:].squeeze(0) + V_const*V_st_gam # ---> n_step_returns[:,:,k] is the sum in (5)
        optimized_returns = n_step_returns.mean(-1) # L,N
        # Maximise the return by minimising its negative.
        mean_cost = -optimized_returns.mean()
        mean_cost.backward()
        grad_norm = torch.norm(flatten_([p.grad for p in policy_nn.parameters()])).item()
        policy_opt.step()

        rewards.append(rew_int.mean().item()/2.0)
        opt_objs.append(mean_cost.mean().item())
        print_log = 'Round: {:4d}/{:<4d}, Iter:{:4d}/{:<4d},  opt. target:{:.3f}  mean reward:{:.3f}  '\
                .format(rounds,num_rounds, episode, num_episodes, np.mean(opt_objs), np.mean(rewards)) + \
                'H={:.2f},  grad_norm={:.3f},  '.format(2.0,grad_norm)

        with torch.no_grad():
            # regress all intermediate values
            last_states = st.detach().contiguous()[:,1:,:] # L,N,T-1,n
            last_values = Vtarget(last_states).squeeze(-1)
            Vtargets = rt[:,:,1:].squeeze(0) + (-ts[1:]/tau).exp()*last_values # L,N,T-1
            Vtargets = Vtargets.mean(-1)

        mean_val_err = 0
        for inner_iter in range(n_value_iters):
            us_V_opt.zero_grad()
            td_error = us_V(s0).squeeze(-1) - Vtargets # L,N
            td_error = torch.mean(td_error**2)
            td_error.backward()
            # Average over the actual number of inner iterations.
            mean_val_err += td_error.item() / n_value_iters
            if inner_iter==0:
                first_val_err = td_error.item()
            us_V_opt.step()

        if episode%(num_episodes//10)==0:
            print(print_log)
            # Save the model, tagging the file with round and episode.
            os.makedirs(policy_dir, exist_ok=True)
            os.makedirs(value_dir, exist_ok=True)

            print("Saving model at round {} and episode {}".format(rounds, episode))
            torch.save(policy_nn.state_dict(), '{}/round_{}_episode_{}.pt'.format(policy_dir, rounds, episode))
            torch.save(us_V.state_dict(), '{}/round_{}_episode_{}.pt'.format(value_dir, rounds, episode))

    # Evaluate the current policy on fresh initial states after every round.
    with torch.no_grad():
        Htest,Ntest,Tup = 30,10,int(3.0/0.1)
        initial_observations = [env.reset() for _ in range(10)]
        s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
        test_states, test_actions, test_rewards,_ = env.integrate_system(T=200, s0=s0, g=policy_nn)

        # Only rewards from step Tup onwards count towards the test score.
        true_test_rewards = test_rewards[...,Tup:].mean().item()
        print("True test rewards: ", true_test_rewards)

        # Close before the early-exit check so the env is also closed when training stops.
        env.close()

        if true_test_rewards > 0.9:
            print("Test rewards > 0.9. Training complete...")
            break

In [None]:
# Run saved policy on the environment
import time

# Use the environment's device (as the training cells do) instead of a hard-coded 'cuda'.
device = env.device
rounds = 2
episode = 200
policy_nn = Policy(env)
policy_nn.load_state_dict(torch.load('models/policy6/round_{}_episode_{}.pt'.format(rounds, episode), map_location=device))
policy_nn.to(device)
policy_nn.eval()

# The value network is trained and saved with a single scalar output, so it
# must be rebuilt with output size 1 for load_state_dict to match.
us_V = basic_mdl(env.n, 1, n_hid_layers=2, act="tanh", n_hidden=200)
us_V.load_state_dict(torch.load('models/value6/round_{}_episode_{}.pt'.format(rounds, episode), map_location=device))
us_V.to(device)
us_V.eval()

n_test_steps = 500  # rollout length, also the number of rendered frames
traj_idx = 6        # which of the 10 test trajectories is rendered

with torch.no_grad():
    Htest,Ntest,Tup = 30,10,int(3.0/0.1)
    initial_observations = [env.reset() for _ in range(10)]
    s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
    test_states,test_actions, test_rewards,_ = env.integrate_system(T=n_test_steps, s0=s0, g=policy_nn)
    # Only rewards from step Tup onwards count towards the test score.
    true_test_rewards = test_rewards[...,Tup:].mean().item()
    print('True test reward: {:.3f}'.format(true_test_rewards))
    # Replay the selected trajectory by writing each state into the env and rendering it.
    for step in range(n_test_steps):
        observation = test_states[traj_idx, step].cpu().numpy()
        env.set_state_(observation)
        time.sleep(0.01)

        img = env.render(mode='rgb_array')

    env.close()

Acrobot


In [5]:
import time
def discount_rewards(r, gamma):
    """Compute discounted reward-to-go along the last axis of ``r``.

    For every index ``t`` on the last axis the result holds
    ``r[..., t] + gamma * r[..., t+1] + gamma**2 * r[..., t+2] + ...``.

    Args:
        r: reward tensor with at least one dimension; time is the last axis.
        gamma: discount factor.

    Returns:
        Tensor with the same shape and dtype as ``r``.
    """
    discounted_r = torch.zeros_like(r)
    running_add = 0
    # Walk backwards in time. Index the last axis explicitly so that batched
    # inputs (..., T) work and agree with the loop bound r.size(-1).
    for t in reversed(range(r.size(-1))):
        running_add = running_add * gamma + r[..., t]
        discounted_r[..., t] = running_add
    return discounted_r

# Train the deterministic policy and the value network; checkpoints are written
# to models/policy7 and models/value7. One test rollout is rendered after each round.

# Set seed for reproducibility (best effort: the environment may keep its own
# random state — TODO confirm).
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)
device = env.device

# Initialize policy network
policy_nn = Policy(env)
policy_nn.to(device)

# Initialize value network
us_V = basic_mdl(env.n, 1, n_hid_layers=2, act="tanh", n_hidden=200)
us_V.reset_parameters()
us_V.to(device)

# Initialize optimizers
policy_opt = optim.Adam(policy_nn.parameters(), lr=0.001)
us_V_opt = optim.Adam(us_V.parameters(), lr=0.001)

# Training parameters
num_episodes = 50
num_steps = 20
num_rounds = 50
gamma = 0.99
tau = 2.0
n_value_iters = 5  # value-regression steps per policy update

flag = True

# Checkpoint directories
policy_dir = 'models/policy7'
value_dir = 'models/value7'

for rounds in range(num_rounds):

    rewards, opt_objs = [], []
    for episode in range(num_episodes):
        # Refresh the frozen target copy of the value network every 50 episodes.
        if episode % 50 == 0:
            Vtarget = copy.deepcopy(us_V)

        initial_observations = [env.reset() for _ in range(50)]
        s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
        # NOTE(review): this time grid is overwritten by integrate_system's return below.
        ts = env.build_time_grid(num_steps).to(device)
        policy_opt.zero_grad()

        st, at, rt, ts = env.integrate_system(T=num_steps, g=policy_nn, s0=s0, N=1)

        rew_int = rt[:,-1].mean(0)  # N

        st = torch.cat([st]*5) if st.shape[0]==1 else st
        ts = ts[0]
        gammas = (-ts/tau).exp() # H
        V_st_gam = us_V(st.contiguous())[:,1:,0] * gammas[1:] # L,N,H-1
        # Weight of the value bootstrap: grows linearly from 0 to 1 over the first 5 rounds.
        V_const = min(rounds/5.0,1)
        n_step_returns = rt[:,1:] + V_const*V_st_gam # ---> n_step_returns[:,:,k] is the sum in (5)
        optimized_returns = n_step_returns.mean(-1) # L,N
        # Maximise the return by minimising its negative.
        mean_cost = -optimized_returns.mean()
        mean_cost.backward()
        grad_norm = torch.norm(flatten_([p.grad for p in policy_nn.parameters()])).item()
        policy_opt.step()

        rewards.append(rew_int.mean().item()/2.0)
        opt_objs.append(mean_cost.mean().item())
        print_log = 'Round: {:4d}/{:<4d}, Iter:{:4d}/{:<4d},  opt. target:{:.3f}  mean reward:{:.3f}  '\
                .format(rounds,num_rounds, episode, num_episodes, np.mean(opt_objs), np.mean(rewards)) + \
                'H={:.2f},  grad_norm={:.3f},  '.format(2.0,grad_norm)

        with torch.no_grad():
            # regress all intermediate values
            last_states = st.detach().contiguous()[:,1:,:] # L,N,T-1,n
            last_values = Vtarget(last_states).squeeze(-1)
            Vtargets = rt[:,1:].squeeze() + (-ts[1:]/tau).exp()*last_values # L,N,T-1
            Vtargets = Vtargets.mean(-1)

        mean_val_err = 0
        for inner_iter in range(n_value_iters):
            us_V_opt.zero_grad()
            td_error = us_V(s0).squeeze(-1) - Vtargets # L,N
            td_error = torch.mean(td_error**2)
            td_error.backward()
            # Average over the actual number of inner iterations.
            mean_val_err += td_error.item() / n_value_iters
            if inner_iter==0:
                first_val_err = td_error.item()
            us_V_opt.step()

        if episode%(num_episodes//5)==0:
            print(print_log)
            # Save the model, tagging the file with round and episode.
            os.makedirs(policy_dir, exist_ok=True)
            os.makedirs(value_dir, exist_ok=True)

            print("Saving model at round {} and episode {}".format(rounds, episode))
            torch.save(policy_nn.state_dict(), '{}/round_{}_episode_{}.pt'.format(policy_dir, rounds, episode))
            torch.save(us_V.state_dict(), '{}/round_{}_episode_{}.pt'.format(value_dir, rounds, episode))

    # Evaluate the current policy on fresh initial states after every round.
    with torch.no_grad():
        Htest,Ntest,Tup = 30,10,int(3.0/0.1)
        initial_observations = [env.reset() for _ in range(10)]
        s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
        test_states, test_actions, test_rewards,_ = env.integrate_system(T=200, s0=s0, g=policy_nn)

        # Only rewards from step Tup onwards count towards the test score.
        true_test_rewards = test_rewards[...,Tup:].mean().item()
        print("True test rewards: ", true_test_rewards)
        # Replay the first test trajectory by writing each state into the env and rendering it.
        for step in range(200):
            observation = test_states[0, step].cpu().numpy()
            env.set_state_(observation)
            time.sleep(0.01)
            img = env.render(mode='rgb_array')

        # Close before the early-exit check so the env is also closed when training stops.
        env.close()

        if true_test_rewards > 0.9:
            print("Test rewards > 0.9. Training complete...")
            break

Round:    0/50  , Iter:   0/50  ,  opt. target:0.001  mean reward:-0.000  H=2.00,  grad_norm=0.002,  
Saving model at round 0 and episode 0


KeyboardInterrupt: 

Cartpole Swingup = TRUE and Pendulum

In [5]:
import wandb
def discount_rewards(r, gamma):
    """Compute discounted reward-to-go along the last axis of ``r``.

    For every index ``t`` on the last axis the result holds
    ``r[..., t] + gamma * r[..., t+1] + gamma**2 * r[..., t+2] + ...``.

    Args:
        r: reward tensor with at least one dimension; time is the last axis.
        gamma: discount factor.

    Returns:
        Tensor with the same shape and dtype as ``r``.
    """
    discounted_r = torch.zeros_like(r)
    running_add = 0
    # Walk backwards in time. Index the last axis explicitly so that batched
    # inputs (..., T) work and agree with the loop bound r.size(-1).
    for t in reversed(range(r.size(-1))):
        running_add = running_add * gamma + r[..., t]
        discounted_r[..., t] = running_add
    return discounted_r

# Train the deterministic policy and the value network with Weights & Biases
# logging; checkpoints are written to models/policy8 and models/value8.

# Set seed for reproducibility (best effort: the environment may keep its own
# random state — TODO confirm).
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

wandb.init(project="swingup_ctpole", entity="bavci")
device = env.device

# Initialize policy network
policy_nn = Policy(env)
policy_nn.to(device)

# Initialize value network
us_V = basic_mdl(env.n, 1, n_hid_layers=2, act="tanh", n_hidden=200)
us_V.reset_parameters()
us_V.to(device)

# Initialize optimizers
policy_opt = optim.Adam(policy_nn.parameters(), lr=0.001)
us_V_opt = optim.Adam(us_V.parameters(), lr=0.001)

# Training parameters
num_episodes = 250
num_steps = 20
num_rounds = 50
gamma = 0.99
tau = 2.0
n_value_iters = 10  # value-regression steps per policy update

flag = True

# Checkpoint directories
policy_dir = 'models/policy8'
value_dir = 'models/value8'

# try/finally guarantees the wandb run is finished even if training is
# interrupted or raises.
try:
    for rounds in range(num_rounds):

        rewards, opt_objs = [], []
        for episode in range(num_episodes):
            # Refresh the frozen target copy of the value network every 50 episodes.
            if episode % 50 == 0:
                Vtarget = copy.deepcopy(us_V)

            initial_observations = [env.reset() for _ in range(50)]
            s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
            # NOTE(review): this time grid is overwritten by integrate_system's return below.
            ts = env.build_time_grid(num_steps).to(device)
            policy_opt.zero_grad()

            st, at, rt, ts = env.integrate_system(T=num_steps, g=policy_nn, s0=s0, N=1)
            rew_int = rt[:,-1].mean(0)  # N

            st = torch.cat([st]*5) if st.shape[0]==1 else st
            ts = ts[0]
            gammas = (-ts/tau).exp() # H
            V_st_gam = us_V(st.contiguous())[:,1:,0] * gammas[1:] # L,N,H-1
            # Weight of the value bootstrap: grows linearly from 0 to 1 over the first 5 rounds.
            V_const = min(rounds/5.0,1)
            n_step_returns = rt[:,1:] + V_const*V_st_gam # ---> n_step_returns[:,:,k] is the sum in (5)
            optimized_returns = n_step_returns.mean(-1) # L,N
            # Maximise the return by minimising its negative.
            mean_cost = -optimized_returns.mean()
            mean_cost.backward()
            grad_norm = torch.norm(flatten_([p.grad for p in policy_nn.parameters()])).item()
            policy_opt.step()

            rewards.append(rew_int.mean().item())
            opt_objs.append(mean_cost.mean().item())
            print_log = 'Round: {:4d}/{:<4d}, Iter:{:4d}/{:<4d},  opt. target:{:.3f}  mean reward:{:.3f}  '\
                    .format(rounds,num_rounds, episode, num_episodes, np.mean(opt_objs), np.mean(rewards)) + \
                    'H={:.2f},  grad_norm={:.3f},  '.format(2.0,grad_norm)
            wandb.log({"round":rounds, "episode":episode, "opt. target":np.mean(opt_objs), "mean reward":np.mean(rewards), "grad_norm":grad_norm})

            with torch.no_grad():
                # regress all intermediate values
                last_states = st.detach().contiguous()[:,1:,:] # L,N,T-1,n
                last_values = Vtarget(last_states).squeeze(-1)
                Vtargets = rt[:,1:].squeeze() + (-ts[1:]/tau).exp()*last_values # L,N,T-1
                Vtargets = Vtargets.mean(-1)

            mean_val_err = 0
            for inner_iter in range(n_value_iters):
                us_V_opt.zero_grad()
                td_error = us_V(s0).squeeze(-1) - Vtargets # L,N
                td_error = torch.mean(td_error**2)
                td_error.backward()
                mean_val_err += td_error.item() / n_value_iters
                if inner_iter==0:
                    first_val_err = td_error.item()
                us_V_opt.step()

            if episode%(num_episodes//5)==0:
                print(print_log)
                # Save the model, tagging the file with round and episode.
                os.makedirs(policy_dir, exist_ok=True)
                os.makedirs(value_dir, exist_ok=True)

                print("Saving model at round {} and episode {}".format(rounds, episode))
                torch.save(policy_nn.state_dict(), '{}/round_{}_episode_{}.pt'.format(policy_dir, rounds, episode))
                torch.save(us_V.state_dict(), '{}/round_{}_episode_{}.pt'.format(value_dir, rounds, episode))

        # Evaluate the current policy on fresh initial states after every round.
        with torch.no_grad():
            Htest,Ntest,Tup = 30,10,int(3.0/0.1)
            initial_observations = [env.reset() for _ in range(10)]
            s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
            test_states, test_actions, test_rewards,_ = env.integrate_system(T=200, s0=s0, g=policy_nn)

            # Only rewards from step Tup onwards count towards the test score.
            true_test_rewards = test_rewards[...,Tup:].mean().item()
            print("True test rewards: ", true_test_rewards)
            wandb.log({"true test rewards":true_test_rewards})

            # Close before the early-exit check so the env is also closed when training stops.
            env.close()

            if true_test_rewards > 0.9:
                print("Test rewards > 0.9. Training complete...")
                break
finally:
    wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbavci[0m. Use [1m`wandb login --relogin`[0m to force relogin




Round:    0/50  , Iter:   0/250 ,  opt. target:-0.192  mean reward:0.069  H=2.00,  grad_norm=0.896,  
Saving model at round 0 and episode 0
Round:    0/50  , Iter:  50/250 ,  opt. target:-0.233  mean reward:0.221  H=2.00,  grad_norm=0.070,  
Saving model at round 0 and episode 50
Round:    0/50  , Iter: 100/250 ,  opt. target:-0.235  mean reward:0.228  H=2.00,  grad_norm=0.014,  
Saving model at round 0 and episode 100
Round:    0/50  , Iter: 150/250 ,  opt. target:-0.235  mean reward:0.231  H=2.00,  grad_norm=0.015,  
Saving model at round 0 and episode 150
Round:    0/50  , Iter: 200/250 ,  opt. target:-0.236  mean reward:0.232  H=2.00,  grad_norm=0.015,  
Saving model at round 0 and episode 200
True test rewards:  0.23690665922598922
Round:    1/50  , Iter:   0/250 ,  opt. target:-0.302  mean reward:0.237  H=2.00,  grad_norm=0.027,  
Saving model at round 1 and episode 0
Round:    1/50  , Iter:  50/250 ,  opt. target:-0.308  mean reward:0.237  H=2.00,  grad_norm=0.051,  
Saving mode

[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 2.1 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 4.2 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 9.1 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 17.1 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 32.1 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 64.9 seconds.), retrying request


Round:   46/50  , Iter: 100/250 ,  opt. target:-0.462  mean reward:0.819  H=2.00,  grad_norm=1.646,  
Saving model at round 46 and episode 100


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 134.9 seconds.), retrying request
[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 293.4 seconds.), retrying request


Round:   46/50  , Iter: 150/250 ,  opt. target:-0.463  mean reward:0.824  H=2.00,  grad_norm=0.304,  
Saving model at round 46 and episode 150
Round:   46/50  , Iter: 200/250 ,  opt. target:-0.464  mean reward:0.826  H=2.00,  grad_norm=0.229,  
Saving model at round 46 and episode 200


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 368.6 seconds.), retrying request


True test rewards:  0.2513452388659245
Round:   47/50  , Iter:   0/250 ,  opt. target:-0.474  mean reward:0.835  H=2.00,  grad_norm=0.740,  
Saving model at round 47 and episode 0


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 346.8 seconds.), retrying request


Round:   47/50  , Iter:  50/250 ,  opt. target:-0.490  mean reward:0.868  H=2.00,  grad_norm=1.361,  
Saving model at round 47 and episode 50
Round:   47/50  , Iter: 100/250 ,  opt. target:-0.486  mean reward:0.877  H=2.00,  grad_norm=0.234,  
Saving model at round 47 and episode 100


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 373.6 seconds.), retrying request


Round:   47/50  , Iter: 150/250 ,  opt. target:-0.476  mean reward:0.870  H=2.00,  grad_norm=1.104,  
Saving model at round 47 and episode 150


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 363.8 seconds.), retrying request


Round:   47/50  , Iter: 200/250 ,  opt. target:-0.472  mean reward:0.871  H=2.00,  grad_norm=0.255,  
Saving model at round 47 and episode 200
True test rewards:  0.25208232945612263
Round:   48/50  , Iter:   0/250 ,  opt. target:-0.439  mean reward:0.877  H=2.00,  grad_norm=0.611,  
Saving model at round 48 and episode 0


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 354.7 seconds.), retrying request


Round:   48/50  , Iter:  50/250 ,  opt. target:-0.468  mean reward:0.881  H=2.00,  grad_norm=1.004,  
Saving model at round 48 and episode 50


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 360.1 seconds.), retrying request


Round:   48/50  , Iter: 100/250 ,  opt. target:-0.483  mean reward:0.874  H=2.00,  grad_norm=0.752,  
Saving model at round 48 and episode 100
Round:   48/50  , Iter: 150/250 ,  opt. target:-0.480  mean reward:0.863  H=2.00,  grad_norm=0.278,  
Saving model at round 48 and episode 150


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 306.5 seconds.), retrying request


Round:   48/50  , Iter: 200/250 ,  opt. target:-0.476  mean reward:0.860  H=2.00,  grad_norm=0.602,  
Saving model at round 48 and episode 200


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 328.5 seconds.), retrying request


True test rewards:  0.29392387881659626
Round:   49/50  , Iter:   0/250 ,  opt. target:-0.440  mean reward:0.869  H=2.00,  grad_norm=0.148,  
Saving model at round 49 and episode 0


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 310.2 seconds.), retrying request


Round:   49/50  , Iter:  50/250 ,  opt. target:-0.467  mean reward:0.883  H=2.00,  grad_norm=0.761,  
Saving model at round 49 and episode 50
Round:   49/50  , Iter: 100/250 ,  opt. target:-0.470  mean reward:0.873  H=2.00,  grad_norm=0.578,  
Saving model at round 49 and episode 100


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 341.5 seconds.), retrying request


Round:   49/50  , Iter: 150/250 ,  opt. target:-0.468  mean reward:0.882  H=2.00,  grad_norm=0.390,  
Saving model at round 49 and episode 150


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 328.4 seconds.), retrying request


Round:   49/50  , Iter: 200/250 ,  opt. target:-0.465  mean reward:0.886  H=2.00,  grad_norm=0.640,  
Saving model at round 49 and episode 200


[34m[1mwandb[0m: 429 encountered (Filestream rate limit exceeded, retrying in 335.8 seconds.), retrying request


True test rewards:  0.27148873619901603


wandb: ERROR Dropped streaming file chunk (see wandb/debug-internal.log)


0,1
episode,▅▅▅▂▂▆▆▂▃▇▇▃▄▇██▄▅▁▁▅▅▂▂▆▆▃▃▃▇▇▄▄█▁▅▅▂▂▆
grad_norm,▁▁▁▁▁▃▂▁▂▁▁▁▂▁▂▂▂▂▃▃▄▂▂▁▃▂▁▂▂▃▃▂▆▄▁▂▃█▁▆
mean reward,▁▁▁▇▇██████▇██████▇▇██▇▇███▇▇▇▇▆▇▇▇▇▇▇▇█
opt. target,█▇▆▁▁▁▁▂▂▃▄▅▅▆▆▆▆▆▆▆▆▆▇▆▆▆▆▆▆▆▆▆▆▆▅▅▅▅▅▅
round,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
true test rewards,▇▆▁▁▃▃▂▂▂▂▁▁▁▁▂▂▇▁▂▂▅▄▆▄▄▄▃▃▄▅▆▇▅▆▆▆████

0,1
episode,249.0
grad_norm,0.47404
mean reward,0.88935
opt. target,-0.46409
round,49.0
true test rewards,0.27149


In [16]:
import time
def discount_rewards(r, gamma):
    """Compute the discounted reward-to-go along the last axis of ``r``.

    For every index ``t`` on the last axis the result holds
    ``r[..., t] + gamma * r[..., t+1] + gamma**2 * r[..., t+2] + ...``.

    Args:
        r: torch tensor of rewards with at least one dimension; time is the
            last axis. Any leading axes are treated as independent batches.
        gamma: scalar discount factor applied per step.

    Returns:
        A tensor with the same shape and dtype as ``r`` holding the
        discounted returns.
    """
    discounted_r = torch.zeros_like(r)
    running_add = 0
    # Walk backwards in time so each step reuses the already-discounted tail.
    # Both the loop bound and the indexing use the LAST axis; indexing with a
    # plain ``r[t]`` would address the first axis and break batched input.
    for t in reversed(range(r.size(-1))):
        running_add = running_add * gamma + r[..., t]
        discounted_r[..., t] = running_add
    return discounted_r

# Fine-tune a policy network and a value network, warm-started from a saved
# checkpoint. Each episode: roll the policy out from a batch of start states,
# take one policy-gradient step on the rollout return plus a bootstrapped
# value term, then regress the value network towards targets computed with a
# periodically refreshed frozen copy (Vtarget).
import random  # hoisted: previously re-imported on every episode inside the loop

# NOTE(review): no random seed is set anywhere in this cell, so runs are not
# reproducible; seed torch / numpy / random here if that is required.
device = env.device

# Checkpoint to warm-start from. Kept under separate names because `rounds`
# and `episode` are reused as the loop variables below.
ckpt_round = 10
ckpt_episode = 200

# Policy network, restored from the checkpoint (map_location keeps the load
# working even if the file was saved from a different device).
policy_nn = Policy(env)
policy_nn.load_state_dict(torch.load(
    'models/policy10/round_{}_episode_{}.pt'.format(ckpt_round, ckpt_episode),
    map_location=device))
policy_nn.to(device)
policy_nn.train()

# Value network, restored from the matching checkpoint.
us_V = basic_mdl(env.n, 1, n_hid_layers=2, act="tanh", n_hidden=200)
us_V.load_state_dict(torch.load(
    'models/value10/round_{}_episode_{}.pt'.format(ckpt_round, ckpt_episode),
    map_location=device))
us_V.to(device)
us_V.train()

# Initialize optimizers
policy_opt = optim.Adam(policy_nn.parameters(), lr=0.001)
us_V_opt = optim.Adam(us_V.parameters(), lr=0.001)

# Training parameters
num_episodes = 250
num_steps = 20
num_rounds = 50
gamma = 0.99  # not used by the loop below, which discounts with exp(-t / tau)
tau = 5.0

# Pool of previously visited states used to seed later rollouts.
experience_buffer = []

# Create the checkpoint directories once, up front (also creates 'models/').
os.makedirs('models/policy11', exist_ok=True)
os.makedirs('models/value11', exist_ok=True)

for rounds in range(num_rounds):

    rewards, opt_objs = [], []
    for episode in range(num_episodes):
        # Refresh the frozen target value network every 100 episodes.
        if episode % 100 == 0:
            Vtarget = copy.deepcopy(us_V)

        # Draw initial states from previous experiences once at least 50 are
        # stored; otherwise reset the environment 50 times.
        if len(experience_buffer) >= 50:
            initial_observations = random.sample(experience_buffer, 50)
        else:
            initial_observations = [env.reset() for _ in range(50)]

        # NOTE(review): the buffer holds rows of `st` (states), yet they are
        # passed through env.obs2state like reset() observations — presumably
        # harmless because the env was built with obs_trans=False; confirm.
        s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
        # NOTE(review): this grid is overwritten by the `ts` returned from
        # integrate_system just below; kept in case build_time_grid has side
        # effects — confirm and remove.
        ts = env.build_time_grid(num_steps).to(device)
        policy_opt.zero_grad()

        st, at, rt, ts = env.integrate_system(T=num_steps, g=policy_nn, s0=s0, N=1)

        rew_int = rt[:, -1].mean(0)

        # Collapse all leading dimensions so every visited state is one row,
        # then keep a random subset of 50 rows for the experience buffer.
        st_flattened = st.contiguous().reshape(-1, st.size(-1))
        indices = torch.randperm(st_flattened.size(0))[:50]
        random_states = st_flattened[indices]
        experience_buffer.extend(random_states.detach().cpu().numpy())

        st = torch.cat([st] * 5) if st.shape[0] == 1 else st
        ts = ts[0]
        gammas = (-ts / tau).exp()  # exponential discount per time point
        V_st_gam = us_V(st.contiguous())[:, 1:, 0] * gammas[1:]
        # Weight of the bootstrapped value term: ramps linearly from 0 to 1
        # over the first five rounds, then stays at 1.
        V_const = min(rounds / 5.0, 1)
        # Author's note: n_step_returns[:,:,k] is the sum in (5)
        n_step_returns = rt[:, 1:] + V_const * V_st_gam
        optimized_returns = n_step_returns.mean(-1)
        # Gradient ascent on the return == gradient descent on its negative.
        mean_cost = -optimized_returns.mean()
        mean_cost.backward()
        grad_norm = torch.norm(flatten_([p.grad for p in policy_nn.parameters()])).item()
        policy_opt.step()

        # NOTE(review): the 2.0 divisor presumably is the rollout horizon
        # (num_steps * dt), matching the hard-coded H=2.00 in the log line.
        rewards.append(rew_int.mean().item() / 2.0)
        opt_objs.append(mean_cost.mean().item())

        with torch.no_grad():
            # Regress all intermediate values: targets combine the rollout
            # rewards with the discounted frozen-target value of each state.
            last_states = st.detach().contiguous()[:, 1:, :]
            last_values = Vtarget(last_states).squeeze(-1)
            Vtargets = rt[:, 1:].squeeze() + (-ts[1:] / tau).exp() * last_values
            Vtargets = Vtargets.mean(-1)
        mean_val_err = 0

        # Ten gradient steps of value regression on the start states.
        for inner_iter in range(10):
            us_V_opt.zero_grad()
            td_error = us_V(s0).squeeze(-1) - Vtargets
            td_error = torch.mean(td_error**2)
            td_error.backward()
            mean_val_err += td_error.item() / 10
            if inner_iter == 0:
                first_val_err = td_error.item()
            us_V_opt.step()

        # Log and checkpoint five times per round.
        if episode % (num_episodes // 5) == 0:
            print_log = (
                'Round: {:4d}/{:<4d}, Iter:{:4d}/{:<4d},  opt. target:{:.3f}  mean reward:{:.3f}  '
                .format(rounds, num_rounds, episode, num_episodes, np.mean(opt_objs), np.mean(rewards))
                + 'H={:.2f},  grad_norm={:.3f},  '.format(2.0, grad_norm)
            )
            print(print_log)

            print("Saving model at round {} and episode {}".format(rounds, episode))
            torch.save(policy_nn.state_dict(), 'models/policy11/round_{}_episode_{}.pt'.format(rounds, episode))
            torch.save(us_V.state_dict(), 'models/value11/round_{}_episode_{}.pt'.format(rounds, episode))

    # End-of-round evaluation: a longer rollout from 10 fresh resets, scored
    # on the rewards from time index Tup onwards.
    with torch.no_grad():
        Htest,Ntest,Tup = 30,10,int(3.0/0.1)
        initial_observations = [env.reset() for _ in range(10)]
        s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
        test_states, test_actions, test_rewards, _ = env.integrate_system(T=300, s0=s0, g=policy_nn)

        true_test_rewards = test_rewards[..., Tup:].mean().item()
        print("True test rewards: ", true_test_rewards)

        if true_test_rewards > 0.9:
            print("Test rewards > 0.9. Training complete...")
            # NOTE(review): this early exit skips the env.close() below.
            break

        env.close()



Round:    0/50  , Iter:   0/250 ,  opt. target:0.004  mean reward:0.000  H=2.00,  grad_norm=0.010,  
Saving model at round 0 and episode 0
Round:    0/50  , Iter:  50/250 ,  opt. target:-0.856  mean reward:0.479  H=2.00,  grad_norm=0.319,  
Saving model at round 0 and episode 50
Round:    0/50  , Iter: 100/250 ,  opt. target:-0.905  mean reward:0.487  H=2.00,  grad_norm=0.977,  
Saving model at round 0 and episode 100
Round:    0/50  , Iter: 150/250 ,  opt. target:-0.927  mean reward:0.490  H=2.00,  grad_norm=0.218,  
Saving model at round 0 and episode 150
Round:    0/50  , Iter: 200/250 ,  opt. target:-0.940  mean reward:0.492  H=2.00,  grad_norm=0.231,  
Saving model at round 0 and episode 200
True test rewards:  0.6188837560979294
Round:    1/50  , Iter:   0/250 ,  opt. target:-1.571  mean reward:0.498  H=2.00,  grad_norm=0.920,  
Saving model at round 1 and episode 0
Round:    1/50  , Iter:  50/250 ,  opt. target:-1.627  mean reward:0.498  H=2.00,  grad_norm=0.217,  
Saving model 

In [22]:
# Run saved policy on the environment
import time

# Load one saved checkpoint, roll the policy out from 10 fresh resets, report
# the test reward and replay the first trajectory through the renderer.
rounds = 18
episode = 200
n_render_steps = 200  # rollout length; also the number of frames replayed

policy_nn = Policy(env)
policy_nn.load_state_dict(torch.load(
    'models/policy10/round_{}_episode_{}.pt'.format(rounds, episode),
    map_location=device))
policy_nn.to(device)
policy_nn.eval()

us_V = basic_mdl(env.n, 1, n_hid_layers=2, act="tanh", n_hidden=200)
us_V.load_state_dict(torch.load(
    'models/value10/round_{}_episode_{}.pt'.format(rounds, episode),
    map_location=device))
us_V.to(device)
us_V.eval()

with torch.no_grad():
    Htest,Ntest,Tup = 30,10,int(3.0/0.1)
    initial_observations = [env.reset() for _ in range(10)]
    s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
    test_states, test_actions, test_rewards, _ = env.integrate_system(T=n_render_steps, s0=s0, g=policy_nn)

    # Score only the rewards from time index Tup onwards.
    true_test_rewards = test_rewards[..., Tup:].mean().item()
    print('True test reward: {:.3f}'.format(true_test_rewards))

    # Replay the first trajectory of the batch, one state per frame.
    for step in range(n_render_steps):
        observation = test_states[0, step].cpu().numpy()
        env.set_state_(observation)
        time.sleep(0.01)

        # The returned frame is not used here; the call is kept for whatever
        # rendering side effect the environment provides.
        env.render(mode='rgb_array')

    env.close()

True test reward: 0.170


In [7]:
# To create a video of evaluation of trained policy run all the saved policies in a for loop and make a video

import time
import imageio
import pickle

# For every saved policy checkpoint: roll it out, dump the rollout tensors to
# a pickle, render the first trajectory frame by frame with a text overlay,
# and write the frames to an .mp4 under video_acrobot/.
device = 'cuda'

total_rounds = 18
total_episodes = 250
n_frames = 300  # rollout length; also the number of frames per video

# Create the output directory once, before any video is written.
os.makedirs('video_acrobot', exist_ok=True)

# With a step of 200, the inner range visits checkpoint episodes 0 and 200.
for rounds in range(total_rounds):
    for episode in range(0, total_episodes, 200):

        policy_nn = Policy(env)
        policy_nn.load_state_dict(torch.load(
            'models/policy10/round_{}_episode_{}.pt'.format(rounds, episode),
            map_location=device))
        policy_nn.to(device)
        policy_nn.eval()

        with torch.no_grad():
            Htest,Ntest,Tup = 30,10,int(3.0/0.1)
            initial_observations = [env.reset() for _ in range(10)]
            s0 = torch.stack([env.obs2state(torch.tensor(obs, device=device)) for obs in initial_observations])
            test_states, test_actions, test_rewards, _ = env.integrate_system(T=n_frames, s0=s0, g=policy_nn)

            # Score only the rewards from time index Tup onwards.
            true_test_rewards = test_rewards[..., Tup:].mean().item()
            print('True test reward: {:.3f}'.format(true_test_rewards))

            # Save test states, actions and rewards.
            # NOTE(review): the file name is the same for every checkpoint, so
            # only the last rollout survives — confirm this is intended.
            with open('test_states_actions_rewards_acrobot.pkl', 'wb') as f:
                pickle.dump([test_states, test_actions, test_rewards], f)

            images = []
            for step in range(n_frames):
                observation = test_states[0, step].cpu().numpy()
                env.set_state_(observation)
                img = env.render(mode='rgb_array')

                # Draw the frame on a figure and add the checkpoint label.
                fig, ax = plt.subplots()
                ax.imshow(img)
                ax.text(10, 30, f'Round: {rounds}, Episode: {episode}', color='white', fontsize=12,
                        bbox=dict(facecolor='black', alpha=0.5))
                ax.axis('off')

                # Rasterise and read the pixels back. buffer_rgba() replaces
                # the deprecated tostring_rgb() and already has the canvas'
                # (height, width, 4) shape; drop alpha and copy so the frame
                # outlives the figure that is closed below.
                fig.canvas.draw()
                rgba = np.asarray(fig.canvas.buffer_rgba())
                img_with_text = np.ascontiguousarray(rgba[..., :3])

                images.append(img_with_text)
                plt.close(fig)  # avoid accumulating open figures

            imageio.mimsave('video_acrobot/round_{}_episode_{}.mp4'.format(rounds, episode), images)
            env.close()

True test reward: -0.001
True test reward: 0.000
True test reward: -0.000
True test reward: -0.005
True test reward: -0.005
True test reward: -0.003
True test reward: -0.005
True test reward: 0.015
True test reward: 0.012
True test reward: 0.022
True test reward: 0.021
True test reward: 0.041
True test reward: 0.035
True test reward: 0.047
True test reward: 0.063
True test reward: 0.066
True test reward: 0.047
True test reward: 0.043
True test reward: 0.049
True test reward: 0.035
True test reward: 0.057
True test reward: 0.105
True test reward: 0.120
True test reward: 0.444
True test reward: 0.588
True test reward: 0.710
True test reward: 0.526
True test reward: 0.629
True test reward: 0.607
True test reward: 0.759
True test reward: 0.772
True test reward: 0.491
True test reward: 0.736
True test reward: 0.037
True test reward: 0.029
True test reward: 0.995


In [None]:
import time
import torch
import matplotlib.pyplot as plt
import numpy as np

# Evaluate saved checkpoints on 10 independent rollouts each and plot the
# mean test reward with a +/- one-standard-deviation band.
total_rounds = 1
total_episodes = 250

# Initialize lists to store x-axis values, mean rewards, and std deviations
x_axis_values = []
mean_rewards_list = []
std_rewards_list = []

# Loop over rounds and episodes
for rounds in range(total_rounds):
    for episode in range(0, total_episodes, 200):

        # Checkpoint names follow the 'round_{}_episode_{}.pt' pattern used
        # when the models are saved.
        policy_nn = Policy(env)
        policy_nn.load_state_dict(torch.load(
            'models/policy10/round_{}_episode_{}.pt'.format(rounds, episode),
            map_location=device))
        policy_nn.to(device)
        policy_nn.eval()

        # The value network has a single scalar output, matching how the saved
        # value networks are constructed elsewhere in this notebook.
        us_V = basic_mdl(env.n, 1, n_hid_layers=2, act="tanh", n_hidden=200)
        us_V.load_state_dict(torch.load(
            'models/value10/round_{}_episode_{}.pt'.format(rounds, episode),
            map_location=device))
        us_V.to(device)
        us_V.eval()

        # List to collect rewards for different initial observations
        rewards = []

        with torch.no_grad():
            Htest,Ntest,Tup = 30,10,int(3.0/0.1)
            initial_observations = [env.reset() for _ in range(10)]

            # One rollout per start state so that a per-rollout spread can be
            # computed afterwards.
            for obs in initial_observations:
                s0 = env.obs2state(torch.tensor(obs, device=device)).unsqueeze(0)
                test_states, test_actions, test_rewards, _ = env.integrate_system(T=400, s0=s0, g=policy_nn)
                true_test_rewards = test_rewards[..., Tup:].mean().item()
                rewards.append(true_test_rewards)

            # Calculate mean and std deviation of rewards
            mean_reward = np.mean(rewards)
            std_reward = np.std(rewards)

            # Store the results; the x position is the round plus the fraction
            # of the round completed at this checkpoint.
            x_value = rounds + (episode / total_episodes)
            x_axis_values.append(x_value)
            mean_rewards_list.append(mean_reward)
            std_rewards_list.append(std_reward)

            print(f'Round: {rounds}, Episode: {episode}, X-axis Value: {x_value:.3f}, Mean Reward: {mean_reward:.3f}, Std Dev: {std_reward:.3f}')

# Convert lists to numpy arrays for plotting
x_axis_values = np.array(x_axis_values)
mean_rewards_list = np.array(mean_rewards_list)
std_rewards_list = np.array(std_rewards_list)

# Define a soft blue color
soft_blue = '#87CEEB'  # Light sky blue

# Plotting the mean test rewards with shaded area for std deviation
plt.figure(figsize=(12, 6))
plt.plot(x_axis_values, mean_rewards_list, linestyle='-', color=soft_blue, label='Mean Test Rewards')
plt.fill_between(x_axis_values,
                 mean_rewards_list - std_rewards_list,
                 mean_rewards_list + std_rewards_list,
                 color=soft_blue, alpha=0.3)

# Customizing the plot
plt.xlabel("Rounds")
plt.ylabel('Mean Test Rewards')
plt.title("Test Performance")
plt.xticks(ticks=range(total_rounds + 1), labels=[f'{i}' for i in range(total_rounds + 1)])
plt.legend()
plt.grid(True)

# Show the plot
plt.show()