In [1]:
import gym
import time
import pybulletgym  # register PyBullet enviroments with open ai gym
import numpy as np
import matplotlib.pyplot as plt
from statistics import mean
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.multivariate_normal import MultivariateNormal

np.set_printoptions(suppress=True, precision=4)

In [2]:
class Policy(nn.Module):

    def __init__(self, num_classes, lr):
        super(Policy, self).__init__()
        
        self.sigma0 = torch.nn.Parameter(torch.FloatTensor([0.1]))
        self.sigma1 = torch.nn.Parameter(torch.FloatTensor([0.1]))
        
        self.fc1 = nn.Linear(8, 64)
        self.hidden = nn.Linear(64, 64)
        self.fc2 = nn.Linear(64, num_classes)
        self.tanh = nn.Tanh()
        
#         print(list(self.parameters()))
        
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        
    def forward(self, x):
        
        x = self.fc1(x)
        x = self.tanh(x)
        x = self.hidden(x)
        x = self.tanh(x)
        x = self.hidden(x)
        x = self.tanh(x)
        x = self.fc2(x)
        x = self.tanh(x)

        s0 = torch.relu(self.sigma0)+0.001
        s1 = torch.relu(self.sigma1)+0.001

        return x, s0, s1

def makePolicyNN(num_actions=2, lr=0.01):
    ''' Initialize the policy class '''
    assert isinstance(num_actions, int) and num_actions>0
    
    return Policy(num_actions, lr)

In [3]:
class PolicyGradient:
    def __init__(self, gamma, N=500, max_steps=1000):
        ''' Initialize the Reacher PyBullet environment '''
        assert isinstance(gamma, float) and 0.0<gamma<1.0, 'Invalid gamma'
        assert isinstance(N, int) and N>0
        assert isinstance(max_steps, int) and max_steps>0
        
        self.env = gym.make("modified_gym_env:ReacherPyBulletEnv-v1", rand_init=False)
        self.gamma = gamma
        self.N = N
        self.max_steps = max_steps
        self.numActions = 2
        self.numObs = 8
        self.num_steps = 0
        self.max_steps_reached = False
        
    def saveModel(self, index):
        ''' Saves the policy NN'''
        filename = 'q2_policy' + str(index) + '.pth.tar'
        torch.save(self.state, filename)

    
    def getAction(self, policy_network, state):
        ''' Return an action from a stochastic policy '''
        assert isinstance(state, np.ndarray) and len(state) == self.numObs
        
        state = torch.from_numpy(state).float()
        torque_mean, sigma0, sigma1 = policy_network(state) #forward pass
        
        # Sampling from the 2D Gaussian and calculating the actions log probability
        torque_sigma = torch.tensor([[sigma0,0],[0,sigma1]])
        m = MultivariateNormal(torque_mean, torque_sigma)
        action = m.sample() #type tensor
        log_prob_of_action = m.log_prob(action) #type tensor

        return action, log_prob_of_action
    
    def runEpisode(self, policy_network):
        ''' Generate [s_t, a_t, r_t] pairs for one episode '''
        
        state = self.env.reset()
        done = False
        rewards, log_prob_of_actions = [], []
        while not(done):
            a, lpa = self.getAction(policy_network, state)
            state, r, done, info = self.env.step(np.array(a))
            r = rewards.append(r)
            log_prob_of_actions.append(lpa)
            self.num_steps += 1 #incrementing total number of steps in one iteration
            if self.num_steps >= self.max_steps: #checking
                self.max_steps_reached = True
                break;
        
        return rewards, log_prob_of_actions
    
    def doReinforcePart1(self, policy_network, verbose=False):
        ''' Improve policy by implementing vanilla version of Reinforce algo '''
        
        gamma = self.gamma
        arr_objective = []
        avg_returns = np.zeros((self.N,))
        num_episodes = np.zeros((self.N,))
        sigma = np.zeros((self.N, 2))
        
        for i in range(self.N): #improving policy for "self.N" number of iterations
            n, objective = 0, 0
            policy_network.optimizer.zero_grad()
            
            # Resetting variables
            self.num_steps = 0
            self.max_steps_reached = False
            
            # Looping until a maximum number of steps are taken
            while not(self.max_steps_reached):
                # Run an episode with policy and count number of steps taken
                r, log_prob_of_actions = self.runEpisode(policy_network)
                n += 1 #increasing the episode count by 1
                
                # Calculate discounted return (G_tau) and summation of log probs of actions from the episode
                G_tau = sum([ r[t] * gamma**t for t in range(len(r))])
                sum_lpa = sum(log_prob_of_actions)
                avg_returns[i] += sum(r)
                
                # Calculate objective
                objective += G_tau * sum_lpa
                assert isinstance(objective, torch.Tensor)
            
            avg_returns[i] /= n
            num_episodes[i] = n
            sigma0, sigma1 = policy_network.sigma0.data.item(), policy_network.sigma1.data.item()
            sigma[i,:] = np.array([sigma0, sigma1])
            
            # Updating policy
            
            objective = -objective/n #averaging over n episodes and flipping sign so it does gradient ascent
            objective.backward()
            # print gradients
            for param in policy_network.parameters():
                print(param.grad)
            break;
            policy_network.optimizer.step()
            
            arr_objective.append(objective.item())

            if verbose and (i%2 == 0):
                print('Iteration: {0} \t Objective: {1:.3f} \t Average reward: {2:.3f} \t Num_episodes: {3}'\
                      .format(i, objective, avg_returns[i], num_episodes[i]))
                
        self.state = { 'state_dict': policy_network.state_dict(),
                      'optimizer': policy_network.optimizer.state_dict() }

        return policy_network, arr_objective, avg_returns, num_episodes, sigma

In [5]:
torch.manual_seed(6)
render_env = True
plot = True

policy_network = makePolicyNN(num_actions=2, lr=1e-3)
# list(policy_network.parameters())
pg = PolicyGradient(gamma=0.9, N = 60, max_steps=6000)
policy_network, arr_objective, avg_returns, num_episodes, sigma = pg.doReinforcePart1(policy_network, verbose=True)

options= 
None
None
tensor([[ 1.1910e-01,  1.1910e-01, -1.4613e-01, -2.1677e-01, -3.1888e-02,
         -4.3922e-02, -3.0235e-02,  3.4058e-01],
        [-2.4177e-03, -2.4177e-03, -1.1277e-02,  2.2228e-02, -2.1582e-01,
         -5.6868e-02,  1.1229e-02, -4.7727e-04],
        [-9.6903e-02, -9.6903e-02,  1.3386e-01,  8.8808e-02, -1.7107e-01,
         -6.7930e-02,  6.2667e-02,  3.6229e-02],
        [-5.4142e-02, -5.4142e-02,  1.1972e-01, -1.7947e-02,  8.8051e-01,
          1.7811e-01, -3.7036e-02, -1.3609e-01],
        [-2.7714e-02, -2.7714e-02,  4.5197e-02,  2.5448e-02,  2.0396e-01,
          6.3895e-02,  1.1836e-03, -8.5996e-02],
        [ 2.1287e-02,  2.1287e-02, -7.2679e-02,  2.6392e-02, -5.5362e-01,
         -1.2462e-01,  6.2141e-03, -1.0853e-01],
        [ 3.9491e-02,  3.9491e-02,  4.1117e-02, -1.2994e-01,  8.1095e-01,
          3.9204e-02, -1.1423e-01,  1.3426e-01],
        [-1.4537e-01, -1.4537e-01,  2.2103e-01,  1.1879e-01,  6.6551e-01,
          1.1444e-01,  7.3860e-03, -2.0422e-0

In [None]:
if plot:
    fig, axs = plt.subplots(2, 2, figsize=(10,10))
    axs = axs.flatten()
    axs[0].plot(arr_objective)
    axs[0].set_xlabel('Iteration')
    axs[0].set_ylabel('Objective')
    axs[0].set_title(' Objective vs iterations ')
    axs[1].plot(avg_returns)
    axs[1].set_xlabel('Iteration')
    axs[1].set_ylabel('Average Rewards')
    axs[1].set_title(' Average Rewards vs iterations ')
    axs[2].plot(num_episodes)
    axs[2].set_title(' Number of episodes vs iterations ')
    axs[3].plot(sigma[:,0], '-r')
    axs[3].plot(sigma[:,1], '-b')
    axs[3].set_title(' Sigma values vs iterations ')
    axs[3].legend(['sigma1', 'sigma2'])
    
    plt.show()

In [None]:
if render_env:
    env = gym.make("modified_gym_env:ReacherPyBulletEnv-v1", rand_init=False)
    steps = 0
    env.render('human')
    state = env.reset()
    done = False
    time.sleep(3)
    while steps<300:
        a, _ = pg.getAction(policy_network, state)
        state, r, done, info = env.step(np.array(a))
        steps+=1
        env.render('human')
        time.sleep(0.1)
    env.env.close()

In [None]:
# policy_network.saveModel(2)

In [7]:
for name, param in policy_network.named_parameters():
    print(name, param.grad)

sigma0 None
sigma1 None
fc1.weight tensor([[ 1.1910e-01,  1.1910e-01, -1.4613e-01, -2.1677e-01, -3.1888e-02,
         -4.3922e-02, -3.0235e-02,  3.4058e-01],
        [-2.4177e-03, -2.4177e-03, -1.1277e-02,  2.2228e-02, -2.1582e-01,
         -5.6868e-02,  1.1229e-02, -4.7727e-04],
        [-9.6903e-02, -9.6903e-02,  1.3386e-01,  8.8808e-02, -1.7107e-01,
         -6.7930e-02,  6.2667e-02,  3.6229e-02],
        [-5.4142e-02, -5.4142e-02,  1.1972e-01, -1.7947e-02,  8.8051e-01,
          1.7811e-01, -3.7036e-02, -1.3609e-01],
        [-2.7714e-02, -2.7714e-02,  4.5197e-02,  2.5448e-02,  2.0396e-01,
          6.3895e-02,  1.1836e-03, -8.5996e-02],
        [ 2.1287e-02,  2.1287e-02, -7.2679e-02,  2.6392e-02, -5.5362e-01,
         -1.2462e-01,  6.2141e-03, -1.0853e-01],
        [ 3.9491e-02,  3.9491e-02,  4.1117e-02, -1.2994e-01,  8.1095e-01,
          3.9204e-02, -1.1423e-01,  1.3426e-01],
        [-1.4537e-01, -1.4537e-01,  2.2103e-01,  1.1879e-01,  6.6551e-01,
          1.1444e-01,  7.3860e

In [None]:
list(policy_network.parameters())

In [None]:
for param in policy_network.parameters():
    print(param.grad)