In [4]:
import gym
import numpy as np
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical

np.set_printoptions(suppress=True, precision=4)

In [5]:
class Policy(nn.Module):

    def __init__(self, num_classes=2):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(4, 16)
        self.fc2 = nn.Linear(16, 16)
        self.fc3 = nn.Linear(16, num_classes)
        self.relu = nn.ReLU(inplace=True)
        self.softmax = nn.Softmax(dim=0) #important param to set --> dim
        


    def forward(self, x):
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.softmax(x)
        
        return x


def makePolicyNN(num_actions):
    ''' Initialize the policy class '''
    assert isinstance(num_actions, int) and num_actions>0
    
    return Policy(num_actions)


In [6]:
class PolicyGradient:
    def __init__(self, gamma, N, T):
        ''' Initialize the cart-pole environment '''
        assert isinstance(gamma, float) and 0.0<gamma<1.0, 'Invalid gamma'
        assert isinstance(N, int) and N>0
        assert isinstance(T, int) and T>0
        
        self.env = gym.make ("CartPole-v1")
        self.gamma = gamma
        self.N = N
        self.T = T
        self.numActions = self.env.action_space.n
    
    def getAction(self, policy_network, state):
        ''' Return an action from a stochastic policy '''
        assert isinstance(state, np.ndarray) and len(state) == 4
        
        state = torch.from_numpy(state).float()
        probs = policy_network(state)
        m = Categorical(probs)
        action = m.sample() #type tensor
        log_prob_of_action = m.log_prob(action)
        
        assert isinstance(action, int) and action in [0,1]
        
        return action.item(), log_prob_of_action.item()
    
    def generateEpisode(self, policy):
        ''' Generate [s_t, a_t, r_t] pairs for one episode '''
        
        initial_state = self.env.reset()
        initial_action, log_prob_a = self.getAction(policy_network, initial_state)
        
        states = [initial_state]
        actions = [initial_action]
        rewards = [r]
        log_prob_of_actions = [log_prob_a]
        
        for t in range(self.T):
            a, log_prob_a = self.getAction(policy_network, states[-1])
            next_state, r, done, info = self.env.step(a)
            
            # Save next state and the action that led to it
            states.append(next_state)
            actions.append(a)
            rewards.append(r)
            log_prob_of_actions.append(log_prob_a)
            
            if done:
                break
        assert len(states) - len(actions) == 0, 'Number of actions should be equal to number of states'
        assert len(states) - len(rewards) == 0, 'Number of rewards should be equal to number of states'

        episode = (states, actions, rewards, log_prob_of_actions)
        return episode
    
    def calculateTerms(self, episode):
        ''' Calculate terms of the objective function '''
        assert isinstance(episode, tuple) and len(episode) == 4
        gamma = self.gamma
        rewards, log_prob_of_actions = episode[1], episode[3]
        term1 = np.sum([rewards[t]*(gamma**t) for t in range(len(rewards))]) # calculating G(tau) which is Monte Carlo estimate of discounted return
        term2 = np.sum(log_prob_of_actions)
        
        return term1, term2
    
    def calculateObjective(self, policy):
        ''' Calculate J(theta) '''
        
        objective = 0
        for i in range(self.N):
            episode_i = self.generateEpisode(policy)
            term1, term2 = self.calculateTerms(episode_i)
            objective += term1*term2
        
        return objective/self.N
    
    def doVanillaReinforce(self, policy, max_iter):
        ''' Improve policy by implementing vanilla version of Reinforce algo '''
        for i in range(max_iter):
            objective = self.calculateObjective(policy)
            objective = torch.Tensor(objective)
            objective.backward() #calculate gradient
            policy_network.optimizer.step() #take a step in the direction of gradient
            

# Need to calculate $ J(\theta) \approx \sum_{i=0}^{N} G(\tau_{i}) \sum_{t=0}^{T} log\pi_{\theta}(a_{t} | s_{t}) $
## Term 1 = $ G(\tau_{i}) = \sum_{t=0}^{T} r_{t}$
## Term 2 = $ \sum_{t=0}^{T} log\pi_{\theta}(a_{t} | s_{t}) $

In [8]:
policy_network = makePolicyNN(num_actions = 2)
policy_network = policy_network

pg = PolicyGradient(gamma=0.9)
episode = pg.generateEpisode(policy_network, 20)

log_prob_of_action =  tensor(-0.6722, grad_fn=<SqueezeBackward1>)
log =  0.0
log_prob_of_action =  tensor(-0.6716, grad_fn=<SqueezeBackward1>)
log =  0.0
log_prob_of_action =  tensor(-0.6726, grad_fn=<SqueezeBackward1>)
log =  0.0
log_prob_of_action =  tensor(-0.6737, grad_fn=<SqueezeBackward1>)
log =  0.0
log_prob_of_action =  tensor(-0.7109, grad_fn=<SqueezeBackward1>)
log =  -inf
log_prob_of_action =  tensor(-0.7128, grad_fn=<SqueezeBackward1>)
log =  -inf
log_prob_of_action =  tensor(-0.7119, grad_fn=<SqueezeBackward1>)
log =  -inf
log_prob_of_action =  tensor(-0.7117, grad_fn=<SqueezeBackward1>)
log =  -inf
log_prob_of_action =  tensor(-0.7110, grad_fn=<SqueezeBackward1>)
log =  -inf
log_prob_of_action =  tensor(-0.6761, grad_fn=<SqueezeBackward1>)
log =  0.0
log_prob_of_action =  tensor(-0.7107, grad_fn=<SqueezeBackward1>)
log =  -inf
log_prob_of_action =  tensor(-0.6764, grad_fn=<SqueezeBackward1>)
log =  0.0
log_prob_of_action =  tensor(-0.7103, grad_fn=<SqueezeBackward1>)
log 



In [10]:
episode[2]

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0]

In [None]:
# env = gym.make('CartPole-v1')
# env.reset()
# env.action_space.sample()
# state, r, done, info = env.step(0)
# q = torch.from_numpy(state).float()

# policy_network(q)
# print(type(policy_network))