In [1]:
import torch
import numpy as np
import gym
import math
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.distributions.relaxed_bernoulli import RelaxedBernoulli
from torch.distributions import Bernoulli
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.autograd import Variable

from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.distributions import Normal

from torch.autograd import Variable

import seaborn as sns

import time

  import pandas.util.testing as tm


# Utility helpers: sequence padding/truncation, plotting and checkpointing

In [2]:
def padding_tensor(sequences):
    """
    Zero-pad variable-length sequences to a common length and build the matching mask.

    :param sequences: list of tensors with shape [seq, state dim]
    :return: (padded, mask), both with shape [num, max_seq_length, state dim];
             mask holds 1 where `padded` carries real data and 0 on the padding
    """
    longest = max(seq.size(0) for seq in sequences)
    template = sequences[0]
    shape = (len(sequences), longest, template.size(-1))

    # `.new(...)` allocates with the dtype and device of the first sequence
    padded = template.data.new(*shape).fill_(0)
    mask = template.data.new(*shape).fill_(0)

    for row, seq in enumerate(sequences):
        steps = seq.size(0)
        padded[row, :steps, :] = seq
        mask[row, :steps, :] = 1
    return padded, mask


def truncate_sequence(list_of_sequence, batch_first, min_len=None):
    """
    Cut every sequence down to its first `min_len` steps and stack the results.

    list_of_sequence: list of tensor with shape [seq, feature dim]
    batch_first: when True the batch axis is the leading axis of the result
    min_len: number of leading steps to keep; defaults to the shortest sequence length
    return : tensor with shape [batch, min_len, fea_dim] if batch_first,
             otherwise [min_len, batch, fea_dim]
    """
    width = list_of_sequence[0].size(-1)
    if min_len is None:
        min_len = min(seq.size(0) for seq in list_of_sequence)

    # the container is a default-dtype CPU tensor; each slice is copied into it
    stacked = torch.zeros(len(list_of_sequence), min_len, width)
    for row, seq in enumerate(list_of_sequence):
        stacked[row] = seq[:min_len, :]

    return stacked if batch_first else stacked.permute(1, 0, 2)

def plot_DRNN(true_obs, pred_DRNN):
    """
    Plot the four state components of one trajectory against the model predictions.

    true_obs : tensor indexed as [time, batch, feature]; only batch element 0 is drawn
               and time step 0 is skipped (the predictions start one step later)
    pred_DRNN : list of per-step prediction tensors that are concatenated along axis 0;
                after concatenation either [time, feature] or [time, batch, feature]
                (batch element 0 is used in the latter case)

    Draws a 1x4 figure (position, velocity, angle, angle velocity); returns None.
    """
    titles = ('Position', 'Velocity', 'Angle', 'Angle velocity')

    # .detach().cpu() on both inputs so tensors living on the GPU can be plotted too
    truth = true_obs[1:, 0, :].detach().cpu().numpy()

    pred = torch.cat(pred_DRNN).detach().cpu()
    if pred.dim() == 3:
        # batched predictions: plot the same batch element as for true_obs
        pred = pred[:, 0, :]
    pred = pred.numpy()

    fig, ax = plt.subplots(1,4, figsize = (20,5))
    for k, title in enumerate(titles):
        ax[k].plot(truth[:, k], label = 'True')
        ax[k].plot(pred[:, k], label = 'Pred')
        ax[k].set_title(title)
        ax[k].legend()

def save_model_policy(model, model_optimiser, policy, policy_optimiser, save_model_path, save_policy_path):
    """
    Checkpoint the model and the policy, each together with its optimiser state.

    The model goes to `<save_model_path>/model.tar` and the policy to
    `<save_policy_path>/policy.tar`. Both files hold a dict with the keys
    "model_dict" (network state_dict) and "trainer_dict" (optimiser state_dict).
    """
    checkpoints = (
        (model, model_optimiser, save_model_path + "/model.tar"),
        (policy, policy_optimiser, save_policy_path + "/policy.tar"),
    )
    for network, optimiser, target in checkpoints:
        payload = {
            "model_dict": network.state_dict(),
            "trainer_dict": optimiser.state_dict()
        }
        torch.save(payload, target)

    print('Checkpointed')


#Cart-pole balancing env

In [3]:
# -*- coding: utf-8 -*-
"""
Classic cart-pole system implemented by Rich Sutton et al.
Copied from http://incompleteideas.net/sutton/book/code/pole.c
permalink: https://perma.cc/C9ZM-652R
Modified by Aaditya Ravindran to include friction and random sensor & actuator noise
"""

import logging
import math
import random
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np

logger = logging.getLogger(__name__)

class CartPoleModEnv(gym.Env):
    """
    Cart-pole balancing task with a continuous action in [-1, 1] and optional noise.

    The integer `case` given to the constructor selects the noise setting:
      * case < 6      : "model" noise only -- the force magnitude is perturbed when the
                        environment is created and again on every reset; observations
                        are returned without noise
      * 6 <= case <= 9: "data" noise only -- every observed state component is scaled
                        by its own (1 + noise) factor; the force magnitude is exact
      * case > 9      : both; observations then always use the case-10 noise
    `_NOISE_SPEC` lists the distribution attached to each case number.
    """
    metadata = {
            'render.modes': ['human', 'rgb_array'],
            'video.frames_per_second' : 50
    }

    # Force applied for an action of magnitude 1, before any actuator noise.
    NOMINAL_FORCE_MAG = 30.0

    # case -> (distribution, magnitude). For 'uniform' the magnitude is the half-width
    # of the interval, for 'gaussian' it is the variance (the std is its square root).
    _NOISE_SPEC = {
        1 : ('none', 0.0),
        2 : ('uniform', 0.05),      #  5% actuator noise ,  small model uniform noise
        3 : ('uniform', 0.10),      # 10% actuator noise ,  large model uniform noise
        4 : ('gaussian', 0.10),     #                       small model gaussian noise
        5 : ('gaussian', 0.50),     #                       large model gaussian noise
        6 : ('uniform', 0.05),      #  5% sensor noise ,    small data uniform noise
        7 : ('uniform', 0.10),      # 10% sensor noise ,    large data uniform noise
        8 : ('gaussian', 0.10),     #                       small data gaussian noise
        9 : ('gaussian', 0.20),     #                       large data gaussian noise
        10: ('gaussian', 0.10),     #                       small both gaussian noise
        11: ('gaussian', 0.50),     #                       large both gaussian noise
    }

    def __init__(self,case):
        """Build the environment for noise setting `case` (see the class docstring)."""
        self.__version__ = "0.2.0"
        print("CartPoleModEnv - Version {}, Noise case: {}".format(self.__version__,case))
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = (self.masspole + self.masscart)
        self.length = 0.5 # actually half the pole's length
        self.polemass_length = (self.masspole * self.length)
        self.seed()

        # origin_case drives the force noise, self.case drives the observation noise
        self.origin_case = case
        if case<6:          #only model  noise
            self.case = 1
        elif case>9:    #both model and data noise
            self.case = 10
        else:               #only data noise
            self.case = case
        self.force_mag = self._sample_force_mag()

        self.tau = 0.02     # seconds between state updates

        self.min_action = -1.
        self.max_action = 1.0

        # Angle at which to fail the episode
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4

        # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
        high = np.array([
            self.x_threshold * 2,
            np.finfo(np.float32).max,
            self.theta_threshold_radians * 2,
            np.finfo(np.float32).max])

        self.action_space = spaces.Box(
                low = self.min_action,
                high = self.max_action,
                shape = (1,)
        )

        self.observation_space = spaces.Box(-high, high)

        self.viewer = None
        self.state = None

        self.steps_beyond_done = None

    def _sample_force_mag(self):
        """Return the force magnitude for a new episode, perturbed unless 6 <= origin_case <= 9."""
        if self.origin_case < 6 or self.origin_case > 9:
            return self.NOMINAL_FORCE_MAG*(1+self.addnoise(self.origin_case))
        return self.NOMINAL_FORCE_MAG

    def addnoise(self,x):
        """
        Draw one relative noise value for noise case `x`.

        Returns 0 for case 1, an array of shape (1,) for cases 2-11 and 1 for any
        other value of `x`. Only the requested distribution is sampled, so each call
        consumes a single draw from `self.np_random`.
        """
        spec = self._NOISE_SPEC.get(x)
        if spec is None:
            return 1
        kind, magnitude = spec
        if kind == 'uniform':
            return self.np_random.uniform(low=-magnitude, high=magnitude, size=(1,))
        if kind == 'gaussian':
            return self.np_random.normal(loc=0, scale=np.sqrt(magnitude), size=(1,))
        return 0

    def seed(self, seed=None): # Set appropriate seed value
        """(Re)create the environment's random generator; returns the seed in a list."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def stepPhysics(self, force):
        """
        Advance the true state by one Euler step of length `self.tau` under `force`.

        Reads `self.state` and returns the new (x, x_dot, theta, theta_dot) without
        storing it.
        """
        x, x_dot, theta, theta_dot = self.state
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / \
                    (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        x  = (x + self.tau * x_dot)
        x_dot = (x_dot + self.tau * xacc)
        theta = (theta + self.tau * theta_dot)
        theta_dot = (theta_dot + self.tau * thetaacc)
        return (x, x_dot, theta, theta_dot)


    def step(self, action):
        """
        Apply `action` (must lie in `self.action_space`) for one time step.

        Returns (observation, reward, done, info). The observation is the new true
        state with every component scaled by its own (1 + noise) draw; `self.state`
        keeps the noise-free state. `done` is decided on the *observed* position and
        angle. The reward is 1.0 up to and including the step on which `done` first
        becomes True and 0.0 on any later call.
        """
        assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
        # .item() extracts the single value; float() on a 1-element array is deprecated in NumPy
        force = self.force_mag * float(np.asarray(action).item())
        self.state = self.stepPhysics(force)
        x, x_dot, theta, theta_dot = self.state         #true state

        # measurement noise, one independent draw per component (same draw order as before:
        # theta, x, x_dot, theta_dot)
        theta = theta * (1+self.addnoise(self.case))
        x = x * (1+self.addnoise(self.case))
        x_dot = x_dot*(1+self.addnoise(self.case))
        theta_dot = theta_dot*(1+self.addnoise(self.case))

        output_state = np.array((x, x_dot, theta, theta_dot))

        done = x < -self.x_threshold \
            or x > self.x_threshold \
            or theta < -self.theta_threshold_radians \
            or theta > self.theta_threshold_radians
        done = bool(done)

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0

        return output_state, reward, done, {}


    def reset(self):
        """Start a new episode: small random state, fresh force magnitude; returns the state."""
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
        self.steps_beyond_done = None

        #also reset the force
        self.force_mag = self._sample_force_mag()

        return np.array(self.state)

    def render(self, mode='human', close=False):
        """
        Draw the current true state with gym's classic-control viewer.

        With `close=True` the viewer (if any) is closed and nothing is drawn. Returns
        whatever `viewer.render` returns, or None when no state has been set yet.
        """
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        screen_width = 600
        screen_height = 400

        world_width = self.x_threshold*2
        scale = screen_width/world_width
        carty = 100 # TOP OF CART
        polewidth = 10.0
        polelen = scale * 1.0
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            # imported lazily so the environment can be used without a display
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l,r,t,b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2
            axleoffset =cartheight/4.0
            cart = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2
            pole = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
            pole.set_color(.8,.6,.4)
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth/2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5,.5,.8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0,carty), (screen_width,carty))
            self.track.set_color(0,0,0)
            self.viewer.add_geom(self.track)

        if self.state is None: return None

        x = self.state
        cartx = x[0]*scale+screen_width/2.0 # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])
        return self.viewer.render(return_rgb_array = mode=='rgb_array')

#DRNN

In [4]:
class DRNN(nn.Module):
    """
    Deterministic recurrent dynamics model.

    The initial observation is encoded into the first hidden state of a recurrent
    cell (RNN, LSTM or GRU, chosen by `mode`), the cell is driven by the action
    sequence, and every hidden state is decoded back into an observation.
    For LSTM the initial cell state is all zeros.
    """
    def __init__(self, action_dim, hidden_dim, output_dim, device, mode):
        """
        action_dim : size of one action (input of the recurrent cell)
        hidden_dim : size of the recurrent hidden state
        output_dim : size of one observation
        device     : device the inputs are moved to in every method
        mode       : 'RNN', 'LSTM' or 'GRU'; anything else raises ValueError
        """
        super(DRNN, self).__init__()

        self.mode = mode

        self.init_encoder = nn.Sequential(
            nn.Linear(output_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.Tanh()
        )

        if mode == 'RNN':
            self.recurrent = nn.RNN(action_dim, hidden_dim)
        elif mode == 'LSTM':
            self.recurrent = nn.LSTM(action_dim, hidden_dim)
        elif mode == 'GRU':
            self.recurrent = nn.GRU(action_dim, hidden_dim)
        else:
            raise ValueError('Mode must be one of RNN, LSTM and GRU')

        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )
        self.device = device

    def _initial_hidden(self, init_x):
        """Encode init_x [batch, output_dim] into the recurrent state fed to the first step."""
        init_h = self.init_encoder(init_x.to(self.device)).unsqueeze(0)     #[1, batch, hidden]
        if self.mode == 'LSTM':
            return (init_h, torch.zeros_like(init_h))       #zero initial cell state
        return init_h

    def _step(self, action, hidden):
        """Run one recurrent step on action [batch, action_dim]; returns (next_x, next_hidden)."""
        next_hidden = self.recurrent(action.unsqueeze(0), hidden)[1]
        # for LSTM the state is the pair (h, c); only h is decoded
        next_h = next_hidden[0] if self.mode == 'LSTM' else next_hidden
        next_x = self.decoder(next_h.squeeze(0))            #[batch, output_dim]
        return next_x, next_hidden

    def forward(self, init_x, actions):
        """
        Predict the observation after every action of a given action sequence.

        init_x : [batch, output_dim]
        actions : [seq, batch, input_dim]
        return : list of `seq` tensors, each [1, batch, output_dim].
                 NOTE: with batch == 1 the per-step squeeze removes the batch axis,
                 so each element is [1, output_dim] instead.
        """
        actions = actions.to(self.device)
        recurrent_states = self.recurrent(actions, self._initial_hidden(init_x))[0]    #[seq, batch, hidden]

        output_list = []
        for h in recurrent_states:
            temp_out = self.decoder(h.squeeze(0))       #[batch, output_dim]
            output_list.append(temp_out.unsqueeze(0))

        return output_list

    def forward2(self, init_x, actions):
        """
        Same prediction as `forward`, computed one time step at a time.

        init_x : [batch, output_dim]
        actions : [seq, batch, input_dim]
        return : list of `seq` tensors, each [1, batch, output_dim]

        The full recurrent state is carried between steps, which for LSTM is the
        (hidden, cell) pair.
        """
        actions = actions.to(self.device)
        hidden = self._initial_hidden(init_x)

        prediction_obs = []
        for t in range(actions.size(0)):
            temp_out, hidden = self._step(actions[t], hidden)
            prediction_obs.append(temp_out.unsqueeze(0))
        return prediction_obs

    def imagine(self, init_x, control_f, horizon, plan):
        """
        Roll the model forward for `horizon` steps, choosing actions with `control_f`.

        init_x : [batch, output]
        control_f : called with the previous predicted observation [batch, output].
                    For plan 'pg' it must return a distribution object offering
                    `sample()` and `log_prob()`; for plan 'rp' it must return a pair
                    whose first element is the action [batch, action_dim].
        plan : 'pg' or 'rp'; anything else raises ValueError
        return : (states, second) where states is [horizon, batch, output_dim] and
                 second is the list of actions for 'rp', or the stacked action
                 log-probabilities [horizon, ...] for 'pg'.
        """
        if plan not in ('pg', 'rp'):
            raise ValueError("plan must be 'pg' or 'rp'")

        previous_x = init_x.to(self.device)         #[batch, output]
        hidden = self._initial_hidden(init_x)

        output_list = []
        action_log_prob_list = []
        action_list = []
        for t in range(horizon):
            if plan == 'pg':
                action_dist = control_f(previous_x)
                action_samples = action_dist.sample()                      #[batch, 1]
                action_log_prob = action_dist.log_prob(action_samples)
                action_log_prob_list.append(action_log_prob.unsqueeze(0))
            else:
                action_samples, _ = control_f(previous_x)          #[batch, 1]
                action_list.append(action_samples)

            next_x, hidden = self._step(action_samples, hidden)
            output_list.append(next_x.unsqueeze(0))             #[1, batch, output_dim]
            previous_x = next_x

        output_list = torch.cat(output_list)           #[horizon, batch, output_dim]
        if plan == 'pg':
            return output_list, torch.cat(action_log_prob_list)
        return output_list, action_list

    def validate_by_imagination(self, init_x, control_f, plan):
        """
        Count how many imagined steps the policy survives, capped at 200.

        init_x : [1, output]; the termination test converts one-element tensors to
                 bool, so a batch larger than 1 is not supported
        control_f : returns a pair whose first element is the action
        plan : unused, kept for interface compatibility
        return : number of steps taken (int). The rollout stops as soon as the
                 predicted position leaves [-2.4, 2.4], the predicted angle leaves
                 +/- 12 degrees, or 200 steps have been taken.
        """
        angle_limit = 12*2*math.pi/360

        previous_x = init_x.to(self.device)
        hidden = self._initial_hidden(init_x)

        reward = 0
        steps = 0
        # only an integer is returned, so no autograd graph is needed
        with torch.no_grad():
            while True:
                action_samples, _ = control_f(previous_x)          #[batch, 1]
                next_x, hidden = self._step(action_samples, hidden)

                reward += 1
                steps += 1

                done = next_x[:,0] < -2.4 \
                   or next_x[:,0] > 2.4 \
                   or next_x[:,2] < -angle_limit \
                   or next_x[:,2] > angle_limit \
                   or steps >= 200

                if bool(done):
                    break
                previous_x = next_x

        return reward

    def imagine_deterministic(self, init_x, control_f, horizon):
        """
        Roll the model forward for `horizon` steps with a policy that returns the action directly.

        init_x : [batch ,state_dim]
        control_f : called with the previous predicted observation, returns the
                    action [batch, action_dim]
        return : [horizon, batch, output_dim]
        """
        previous_x = init_x.to(self.device)
        hidden = self._initial_hidden(init_x)

        output_list = []
        for t in range(horizon):
            # no squeeze here: squeezing would drop the batch axis when batch == 1
            action = control_f(previous_x)       #[batch, 1]
            next_x, hidden = self._step(action, hidden)
            output_list.append(next_x.unsqueeze(0))             #[1, batch, output_dim]
            previous_x = next_x

        return torch.cat(output_list)           #[horizon, batch, output_dim]





#Deterministic controller

In [5]:
class controller(nn.Module):
    """
    Deterministic policy: a small MLP whose output is squashed to [-1, 1] by tanh.

    The module owns its own Adam optimiser (`self.optimiser`) and offers two ways of
    training on model rollouts: `pg_train` (score-function / policy gradient) and
    `rp_train` (back-propagation through the rollout).
    """
    def __init__(self, action_dim=1, state_dim=4, deterministic = True,device = 'cuda'):
        """
        action_dim : size of the action
        state_dim : size of the state
        deterministic : currently unused
        device : device the inputs are moved to in every method
        """
        super(controller, self).__init__()
        # A device passed as third positional argument, e.g.
        # controller(action_dim, state_dim, 'cpu'), lands in `deterministic`;
        # recognise that instead of silently keeping the 'cuda' default.
        if isinstance(deterministic, (str, torch.device)):
            device = deterministic
            deterministic = True

        self.action_dim = action_dim
        controller_hid = 16
        self.state_dim  = state_dim

        self.linear = nn.Sequential(
            nn.Linear(self.state_dim,controller_hid),
            nn.ReLU(),
            nn.Linear(controller_hid, controller_hid),
            nn.ReLU(),
            nn.Linear(controller_hid, action_dim)
        )

        self.device = device
        self.optimiser = torch.optim.Adam(self.parameters(), lr = 0.001)

    def _discount(self, length, gamma):
        """Return [gamma^1, ..., gamma^length] shaped [length, 1, 1] on self.device."""
        weights = torch.tensor([gamma**(t+1) for t in range(length)])
        return weights.unsqueeze(-1).unsqueeze(-1).to(self.device)

    def forward(self, state):
        """
        Given states input [batch, state_dim], return (action, log_pi).

        action is [batch, action_dim] in [-1, 1]; log_pi is always the constant 0
        because the policy is deterministic.
        """
        state = state.to(self.device)
        a = torch.tanh(self.linear(state))
        log_pi = 0

        return a, log_pi

    def make_decision(self, state, behaviour_uncertainty):
        """
        given a state [batch, state_dim], output a action detached from the graph

        behaviour_uncertainty : currently unused
        """
        state = state.to(self.device)
        a = torch.tanh(self.linear(state))

        return a.detach()

    def pg_train(self, num_epoch, initial_state, horizon, cost_f, model_imagine_f, w_uncertainty, e_uncertainty,gamma = 0.95, num_particle = 100):
        """
        Train the policy with a score-function gradient on imagined rollouts.

        initial_state : [1, state_dim] (expanded to num_particle copies)
        cost_f : maps the imagined states to a cost [seq, batch, 1]
        model_imagine_f : called as model_imagine_f(states, self.forward, horizon,
                          plan='pg', W_uncertainty=..., e_uncertainty=...) and must
                          return (states, action log-probabilities)
        gamma : discount; step t (0-based) is weighted by gamma^(t+1)
        num_particle : number of parallel rollouts per epoch
        return : list with the loss of every epoch
        """
        loss_list = []
        initial_state = initial_state.expand(num_particle, self.state_dim)

        for e in range(num_epoch):
            # start every epoch from clean gradients; otherwise they add up across epochs
            self.optimiser.zero_grad()
            output_matrix, action_log_prob_matrix = model_imagine_f(initial_state, self.forward, horizon, plan = 'pg',
                                                                    W_uncertainty = w_uncertainty, e_uncertainty = e_uncertainty)
            cost = cost_f(output_matrix).detach()               #[seq-1, batch, 1]
            cost = cost * self._discount(cost.size(0), gamma)

            # total discounted cost of a rollout times the log-probability of its actions
            loss = cost.sum(0) * action_log_prob_matrix.sum(0)

            loss = loss.sum()
            loss.backward()
            nn.utils.clip_grad_norm_(self.parameters(), 5)

            self.optimiser.step()
            loss_list.append(loss.item())
            if e%50 == 0:
                print('Epoch = {}; Policy gradient training loss = {}'.format(e, loss.item()))
        return loss_list

    def rp_train(self, num_epoch, num_particle,initial_state, horizon , cost_f, model_imagine_f, gamma = 0.9):
        """
        Train the policy by back-propagating the rollout cost through the model.

        From an initial state the model imagines `horizon` steps under the actions
        proposed by this controller; the discounted cost summed over the whole
        rollout is the loss whose gradient w.r.t. the policy parameters is taken.

        initial_state : [1, output_dim] (expanded to num_particle copies)
        model_imagine_f : called as model_imagine_f(states, self.forward, horizon,
                          plan='rp') and must return (states, actions)
        return : (loss per epoch as list, mean undiscounted rollout cost per epoch,
                  std of the undiscounted rollout cost per epoch)

        Side effect: the last rollout is kept in `self.action_matrix` and
        `self.temp_output_matrix` (initial state followed by the imagined states).
        """
        loss_list = []
        initial_state = initial_state.expand(num_particle, self.state_dim)

        cost_mean_list = []
        cost_std_list = []

        for e in range(num_epoch):
            self.optimiser.zero_grad()
            output_matrix, action_matrix= model_imagine_f(initial_state, self.forward, horizon, plan = 'rp')

            self.action_matrix = action_matrix
            self.temp_output_matrix = torch.cat([initial_state.unsqueeze(0).to(self.device), output_matrix], dim = 0)

            cost = cost_f(output_matrix)                 #[seq-1, batch, 1]

            # statistics of the undiscounted cost, for monitoring only
            rollout_cost = cost.data.sum(0)
            cost_mean_list.append(rollout_cost.mean(0))
            cost_std_list.append(rollout_cost.std(0))

            cost = cost * self._discount(cost.size(0), gamma)

            loss = cost.sum()
            loss.backward()

            nn.utils.clip_grad_norm_(self.parameters(), 1)
            self.optimiser.step()
            loss_list.append(loss.item())

        return loss_list, torch.cat(cost_mean_list), torch.cat(cost_std_list)

    def rp_validate(self, num_particle, initial_state, horizon, cost_f, model_imagine_f, gamma = 1, print_trajectory = False):
        """
        Evaluate the current policy on imagined rollouts without training.

        gamma : currently unused
        return : the mean undiscounted rollout cost (float), or, when
                 print_trajectory is True, (mean state per step, std of the state
                 per step over the particles, actions).
        """
        initial_state = initial_state.expand(num_particle, self.state_dim)
        # pure evaluation, nothing is back-propagated
        with torch.no_grad():
            output_matrix, action_matrix=  model_imagine_f(initial_state, self.forward, horizon, plan = 'rp')
            cost = cost_f(output_matrix)

        mean_cost = cost.data.sum(0).mean(0)

        #outputing trajectory
        if print_trajectory:
            mean_output = output_matrix.mean(1)     #[seq-1, output]
            std_output = output_matrix.std(1)
            return mean_output, std_output, action_matrix

        return mean_cost.item()



#Agent

In [6]:
class Agent:
    def __init__(self, env_case,state_dim = 4, action_dim = 1, device='cuda', rand_seed = 1):
        
        self.env =  CartPoleModEnv(case = env_case)
        self.env_case = env_case

        self.state_dim = state_dim
        self.action_dim = action_dim
        self.device = device
        self.observations_list = []
        self.actions_list= []
        self.MSEloss = nn.MSELoss()

        self.model = DRNN(action_dim, 32, state_dim, device, 'LSTM').to(device)
        self.model_optimiser = torch.optim.Adam(self.model.parameters(), lr = 0.001)
        self.mseloss = nn.MSELoss()
        self.model_training_loss_list = []

        self.policy = controller(action_dim, state_dim,device).to(device)
        #np.random.seed(rand_seed)
        #torch.manual_seed(rand_seed)
        #self.env.seed(rand_seed)

    
    def env_rollout(self, if_remember, plan, behaviour_uncertainty):
        """
        interaction with the environment using the current policy. 
        """
        done = False
        state = self.env.reset()
        total_reward = 0
        i = 0
        temp_obs_list = []
        temp_actions_list = []

        if if_remember:
            temp_obs_list.append(torch.tensor(state))
        
        while not done:
            i+=1
            
            if plan == 'random':
                action = self.env.action_space.sample()
            elif plan == 'pg' or 'rp':
                state_tensor = torch.tensor(np.vstack(state)).float().squeeze()

                action = self.policy.make_decision(state_tensor.to(self.device), behaviour_uncertainty)
                action = action.detach().cpu().numpy()
                
            else:
                raise NotImplementedError
            #print('action = ', action)

            next_state, reward, _, _  = self.env.step(action)

            done = next_state[0] < -2.4 \
                or next_state[0] > 2.4 \
                or next_state[2] < -12 * 2 * math.pi / 360 \
                or next_state[2] > 12 * 2 * math.pi / 360 \
                or i >= 200


            if if_remember:

                temp_obs_list.append(torch.tensor(np.vstack(next_state)).squeeze())

                temp_actions_list.append(torch.tensor(action).float())

            
            state = next_state
            total_reward += 1

        if if_remember:
            self.observations_list.append(torch.stack(temp_obs_list).float())       #list of shape [seq, output]
            self.actions_list.append(torch.stack(temp_actions_list).float())        #list of shape [seq-1, 1]

        return total_reward
    
    def model_learning(self, num_epoch, num_batch):
        """
        perform model leanring using data self.observation_list and self.actions_list; since the data has variable length, one could 
        try truncate the data into same length or pack_padded_sequence, but here we would simply train each single sample in a batch,
        and during each epoch, the parameter is only updated once using part of the dataset
        num_epoch : number of training epoch
        num_batch: this is actually number of samples we want the model to be trained on during each epoch
        """



        for e in range(num_epoch):
            self.model_optimiser.zero_grad()

            idx = np.random.choice(len(self.observations_list), num_batch)
            trun_obs = truncate_sequence([self.observations_list[i] for i in idx], batch_first = False)
            trun_actions = truncate_sequence([self.actions_list[j] for j in idx], batch_first = False)

            pred = self.model(trun_obs[0,:,:], trun_actions)
            loss = self.MSEloss(torch.cat(pred), trun_obs[1:,:,:].to('cuda'))
            loss.backward()

            

            #for i in idx:
            #    training_obs = self.observations_list[i].unsqueeze(1)       #[seq, 1, output]
            #    training_actions = self.actions_list[i]                      #[seq-1, 1]

            #    pred = self.model(training_obs[0,:,:], training_actions.unsqueeze(-1))
            #    loss = self.mseloss(torch.cat(pred).unsqueeze(1), training_obs[1:, :, :].to(self.device))
            #    temp_loss += loss
            #temp_loss.backward()

            self.model_optimiser.step()
            self.model_training_loss_list.append(loss.item())

            if e%500 == 0:
                print('Epoch:{}; loss = {}.'.format(e, loss.item()))


    def cost(self, state):
        """
        Quadratic cost of a state: position^2 + 5*angle^2
        (position is read from feature index 0, angle from feature index 2)
        state : [seq, batch, output]
        return [seq, batch, 1]
        """
        angle = state[:, :, 2]
        position = state[:, :, 0]
        total = 5 * angle ** 2 + position ** 2          #[seq, batch]
        return torch.unsqueeze(total, -1)               #[seq, batch, 1]

    '''
    def cost(self, states, sigma=0.25):
        """
        states : [seq, batch, output]
        return : [seq, batch, 1]
        """
        l = 0.6
        seq_length = states.size(0)
        batch_size = states.size(1)
        feature_dim = states.size(-1)
        
        goal = Variable(torch.FloatTensor([0.0, l])).unsqueeze(0).unsqueeze(0).expand(seq_length,1, 2).to(self.device)     #[seq, 1,2]

        # Cart position
        cart_x = states[:,:, 0]         #[seq, batch]
        # Pole angle
        thetas = states[:,:,2]          #[seq, bnatch]
        # Pole position
        x = torch.sin(thetas)*l         #[seq, batch]
        y = torch.cos(thetas)*l
        positions = torch.stack([cart_x + x, y], -1)             #[seq, batch, 2]

        
        squared_distance = torch.sum((goal - positions)**2, -1).unsqueeze(-1)          #[]

        squared_sigma = sigma**2
        cost = 1 - torch.exp(-0.5*squared_distance/squared_sigma)
        
        return cost
    '''




    def policy_learning(self, imagine_num, num_particle, num_epoch, batch_size, horizon, plan, plot = False):
        """
        Use the currently learned model to train the policy on imagined data.

        For each of the `imagine_num` rounds one initial state is drawn with self.env.reset() and
        passed, together with self.cost and self.model.imagine, to the policy trainer selected by
        `plan`.

        imagine_num : number of initial states, i.e. number of calls to the policy trainer
        num_particle : forwarded unchanged to the policy trainer
        num_epoch : number of epochs the policy trainer is asked to run for each initial state
        batch_size : not used by this method; kept so existing callers keep working
        horizon : forwarded unchanged to the policy trainer (fixed length of the imagined rollout)
        plan : 'pg' -> self.policy.pg_train with gamma=1, 'rp' -> self.policy.rp_train with gamma=0.95
        plot : if True, the initial state is replaced by an all-zero state of the same shape

        raises ValueError if `plan` is neither 'pg' nor 'rp' (previously this silently trained nothing)
        """
        #pick the trainer once; it does not change between rounds
        if plan == 'pg':
            train_f, gamma = self.policy.pg_train, 1
        elif plan == 'rp':
            train_f, gamma = self.policy.rp_train, 0.95
        else:
            raise ValueError("plan must be 'pg' or 'rp', got {!r}".format(plan))

        model_f = self.model.imagine

        for _ in range(imagine_num):
            #create the initial state
            initial_state = torch.tensor(self.env.reset()).unsqueeze(0).float()         #[1, output]
            if plot:
                initial_state = torch.zeros_like(initial_state)

            #learn the policy parameter using current model
            train_f(num_epoch, num_particle, initial_state, horizon, self.cost, model_f, gamma=gamma)


# Learning-Planning iterations

In [7]:
time1 = time.time()
testing_reward_list = []
mean_training_reward_list = []
std_training_reward_list = []
device = 'cuda'

behaviour_uncertainty = False
deterministic = False
plan = 'rp'
num_data = 10               # random rollouts collected before the first model fit
num_test_episodes = 20      # evaluation rollouts per iteration
num_iterations = 100        # learning-planning iterations after the warm-up

torch.manual_seed(1)
np.random.seed(1)           # model_learning samples its trajectories with np.random.choice
agent = Agent(env_case = 1, device = device)


def learn_model_and_policy(agent, plan):
    """Fit the model on the stored rollouts, then train the policy on imagined data."""
    agent.model_learning(num_epoch=1000, num_batch = 10)
    agent.policy_learning(imagine_num=50, num_particle = 1000, num_epoch = 1, batch_size = 10, horizon = 10, plan = plan)


def evaluate_policy(agent, num_episodes, behaviour_uncertainty, plan):
    """
    Run `num_episodes` rollouts that are not stored in the agent's data, print each reward and
    return the average reward.
    """
    print("--------------------Testing------------------")
    reward_sum = 0
    for episode in range(num_episodes):
        rewards = agent.env_rollout(if_remember=False, behaviour_uncertainty = behaviour_uncertainty,plan = plan)
        print(episode, rewards)
        reward_sum += rewards
    avg = reward_sum/num_episodes
    # report the number of rollouts that were run (the old code printed the last loop index)
    print('Total trajs:', num_episodes, avg)
    if avg > 200:
        print('success')
    return avg


# Warm-up: collect random rollouts, then do a first round of learning and planning
for _ in range(num_data):
    agent.env_rollout(True, behaviour_uncertainty = behaviour_uncertainty,plan = 'random')

learn_model_and_policy(agent, plan)

avg_rewards = evaluate_policy(agent, num_test_episodes, behaviour_uncertainty, plan)
testing_reward_list.append(avg_rewards)


mean_cost10_list = []
std_cost10_list = []
mean_cost100_list = []
std_cost100_list = []

avg_data_length_list = []

imagine_pred_list = []
imagine_action_list = []

for i in range(num_iterations):
    print('Epoch = ',i+1)
    # add one rollout of the current policy to the training data
    agent.env_rollout(True, behaviour_uncertainty = behaviour_uncertainty,plan = plan)

    # computed without a second `for i` loop so the epoch index is not shadowed
    avg_data_length = sum(len(obs) for obs in agent.observations_list)/len(agent.observations_list)
    print('Average training data length = ',avg_data_length)
    avg_data_length_list.append(avg_data_length)

    learn_model_and_policy(agent, plan)

    avg_rewards = evaluate_policy(agent, num_test_episodes, behaviour_uncertainty, plan)
    testing_reward_list.append(avg_rewards)


time2 = time.time()
print(time2-time1)


CartPoleModEnv - Version 0.2.0, Noise case: 1




Epoch:0; loss = 0.4223226308822632.
Epoch:500; loss = 0.0034770409110933542.
--------------------Testing------------------
0 20
1 17
2 24
3 16
4 19
5 19
6 16
7 15
8 19
9 16
10 15
11 19
12 21
13 20
14 22
15 16
16 16
17 21
18 19
19 17
Total trajs: 19 18.35
Epoch =  1
Average training data length =  16.181818181818183
Epoch:0; loss = 0.0007143482798710465.
Epoch:500; loss = 0.0007419626927003264.


KeyboardInterrupt: ignored