In [1]:
import gym
# import pybulletgym.envs
import pybulletgym  # register PyBullet enviroments with open ai gym
import numpy as np
import matplotlib.pyplot as plt
from statistics import mean
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.multivariate_normal import MultivariateNormal

np.set_printoptions(suppress=True, precision=4)

In [9]:
class Policy(nn.Module):

    def __init__(self, num_classes, lr):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(9, 128)
        self.fc2 = nn.Linear(128, num_classes)
        self.tanh = nn.Tanh()
        
#         self.sigma0 = nn.parameter(torch.tensor([0.1]))
#         self.sigma1 = nn.parameter(torch.tensor([0.1]))
        
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        
    def forward(self, x):
        
        x = self.fc1(x)
        x = self.tanh(x)
        x = self.fc2(x)
        x = self.tanh(x)
        
        sigma0 = torch.relu(self.sigma0)
        sigma1 = torch.relu(self.sigma1)
        
        if self.sigma0 <= 0.001:
            sigma0 = sigma0 + 0.001
        
        if self.sigma1 <= 0.001:
            sigma1 = sigma1 + 0.001
        
        return x, sigma0, sigma1

def makePolicyNN(num_actions=2, lr=0.01):
    ''' Initialize the policy class '''
    assert isinstance(num_actions, int) and num_actions>0
    
    return Policy(num_actions, lr)

In [14]:
class PolicyGradient:
    def __init__(self, gamma, N=500, max_steps=1000):
        ''' Initialize the Reacher PyBullet environment '''
        assert isinstance(gamma, float) and 0.0<gamma<1.0, 'Invalid gamma'
        assert isinstance(N, int) and N>0
        assert isinstance(max_steps, int) and max_steps>0
        
        self.env = gym.make("modified_gym_env:ReacherPyBulletEnv-v1", rand_init=False)
        self.gamma = gamma
        self.N = N
        self.max_steps = max_steps
        self.numActions = self.env.action_space.shape[0]
        self.numObs = self.env.observation_space.shape[0]
        self.num_steps = 0
        self.max_steps_reached = False
    
    def getAction(self, policy_network, state):
        ''' Return an action from a stochastic policy '''
        print(state)
        assert isinstance(state, np.ndarray) and len(state) == self.numObs
        
        state = torch.from_numpy(state).float()
        torque_mean, sigma0, sigma1 = policy_network(state) #forward pass
        # Defining diagonal elements of cov matrix 
        torque_cov = torch.eye(2)
        torque_cov[0,0], torque_cov[1,1] = sigma0, sigma1
        # Sampling from the 2D Gaussian and calculating the actions log probability
        m = MultivariateNormal(torque_mean, torque_cov)
        action = m.sample() #type tensor
        log_prob_of_action = m.log_prob(action) #type tensor
        action = action.item() #extracting value from tensor
        
        assert isinstance(action, (int, float))
        
        return action, log_prob_of_action
    
    def runEpisode(self, policy_network):
        ''' Generate [s_t, a_t, r_t] pairs for one episode '''
        
        state = self.env.reset()
        done = False
        rewards, log_prob_of_actions = [], []
        while not(done):
            a, lpa = self.getAction(policy_network, state)
            state, r, done, info = self.env.step(a)
            r = rewards.append(r)
            log_prob_of_actions.append(lpa)
            self.num_steps += 1 #incrementing total number of steps in one iteration
            if self.num_steps >= self.max_steps: #checking
                self.max_steps_reached = True
                break;
        
        return rewards, log_prob_of_actions
    
    def doReinforcePart1(self, policy_network, verbose=False):
        ''' Improve policy by implementing vanilla version of Reinforce algo '''
        
        gamma = self.gamma
        arr_objective = []
        avg_returns = np.zeros((self.N,))
        num_episodes = np.zeros((self.N,))
        
        for i in range(self.N): #improving policy for "self.N" number of iterations
            n, objective = 0, 0
            
            # Resetting variables
            self.num_steps = 0
            self.max_steps_reached = False
            
            # Looping until a maximum number of steps are taken
            while not(self.max_steps_reached):
                # Run an episode with policy and count number of steps taken
                r, log_prob_of_actions = self.runEpisode(policy_network)
                n += 1 #increasing the episode count by 1
                
                # Calculate discounted return (G_tau) and summation of log probs of actions from the episode
                G_tau = sum([ r[t] * gamma**t for t in range(len(r))])
                sum_lpa = sum(log_prob_of_actions)
                avg_returns[i] += sum(r)
                
                # Calculate objective
                objective += G_tau * sum_lpa
                assert isinstance(objective, torch.Tensor)
            
            avg_returns[i] /= n
            num_episodes[i] = n
            # Updating policy
            policy_network.optimizer.zero_grad()
            objective = -objective/n #averaging objective over n episodes and flipping sign so it does gradient ascent
            objective.backward()
            policy_network.optimizer.step()
            arr_objective.append(objective.item())

            if verbose and (i%20 == 0):
                print('Iteration: {0} \t Objective: {1:.3f} \t Average reward: {2:.3f} \t Num_episodes: {3}'\
                      .format(i, objective, avg_returns[i], num_episodes[i]))

        return policy_network, arr_objective, avg_returns, num_episodes

In [15]:
if __name__ == '__main__':
    torch.manual_seed(1)
    render_env = True
    plot = True
    
    policy_network = makePolicyNN(num_actions=2, lr=1e-3)
    pg = PolicyGradient(gamma=0.9, N = 500, max_steps=500)
    rewards, log_prob_of_actions = pg.runEpisode(policy_network)
    print(rewards, log_prob_of_actions)
#     policy_network, arr_objective, avg_returns, num_episodes = pg.doReinforcePart1(policy_network, verbose=True)

options= 
[ 0.3928  0.3928 -0.6809  0.2656  0.5     0.      0.0833  0.    ]


RuntimeError: size mismatch, m1: [1 x 8], m2: [9 x 128] at /Users/distiller/project/conda/conda-bld/pytorch_1570710876898/work/aten/src/TH/generic/THTensorMath.cpp:197