<a href="https://colab.research.google.com/github/bkgsur/foundationsofdrl/blob/main/REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
from torch.distributions import Categorical
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [29]:
# --- Environment / episode settings ---
ENV_NAME = "CartPole-v0"  # environment id handed to gym.make
EPISODES = 300            # number of training episodes run by main()
TIME_UNITS = 200          # upper bound on steps taken within one episode
TERMINATION = 195         # an episode is reported "solved" when its total reward exceeds this

# --- Learning hyperparameters ---
GAMMA = 0.99              # discount factor used when computing returns
LEARNING_RATE = 0.01      # learning rate passed to the Adam optimizer
HIDDEN_UNITS = 64         # width of the policy network's hidden layer

In [32]:
class Policy(nn.Module):
  """Small MLP policy producing logits over a discrete set of actions.

  The network is Linear -> ReLU -> Linear. The instance also carries two
  per-episode buffers (`log_probablity_actions`, `rewards`) that callers
  fill while interacting with the environment and reset between episodes.
  """

  def __init__(self, in_dim, out_dim, hidden_units=None):
    """Build the policy network.

    Args:
      in_dim: size of the state vector fed to the network.
      out_dim: number of discrete actions (size of the logits output).
      hidden_units: width of the hidden layer. When None, the module-level
        HIDDEN_UNITS constant is used.
    """
    super(Policy,self).__init__()
    if hidden_units is None:
      hidden_units = HIDDEN_UNITS
    layers = [
      nn.Linear(in_dim, hidden_units),
      nn.ReLU(),
      nn.Linear(hidden_units, out_dim),
    ]
    self.policynetwork = nn.Sequential(*layers)
    # Episode buffers; initialised empty here and managed by the caller.
    self.log_probablity_actions = []
    self.rewards = []
    # Put the module in training mode.
    self.train()

  def forward(self, state):
    """Return the action logits for a float tensor `state`."""
    return self.policynetwork(state)

  def action_based_on_state(self, state):
    """Sample an action for `state` from the current policy.

    Args:
      state: the observation as a NumPy array or any array-like of numbers
        (list, tuple); it is copied and cast to float32.

    Returns:
      A tuple `(action, log_prob)` where `action` is a Python int and
      `log_prob` is the tensor log-probability of that action, still
      attached to the autograd graph so it can be used in a loss.
    """
    # np.array always copies, so the tensor never aliases the caller's buffer.
    state_tensor = torch.from_numpy(np.array(state, dtype=np.float32))
    policydistribution = Categorical(logits=self(state_tensor))
    action = policydistribution.sample()  # sample action from policy
    return action.item(), policydistribution.log_prob(action)

In [45]:
def train_per_episode(rewards,log_probablity_actions,optimizer):
  """Perform one policy-gradient update from a single episode.

  Args:
    rewards: the per-step rewards collected during the episode.
    log_probablity_actions: list of log-probability tensors, one per step.
    optimizer: optimizer whose gradients are zeroed and which is stepped
      once on the computed loss.

  Returns:
    The loss tensor: the sum over steps of `-log_prob * return`.
  """
  episode_returns = torch.tensor(returns_per_episode(rewards))
  stacked_log_probs = torch.stack(log_probablity_actions)
  # Negate the log-probabilities so that minimising the loss increases the
  # probability of actions weighted by their return.
  weighted_terms = stacked_log_probs.neg() * episode_returns
  loss = weighted_terms.sum()

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  return loss

#Return for episode
def returns_per_episode(rewards, gamma=None):
  """Compute the discounted return from every time step of an episode.

  For each step t the result holds
  `rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...`,
  accumulated backwards from the last step.

  Args:
    rewards: sequence of per-step rewards, indexable and with a length.
    gamma: discount factor. When None, the module-level GAMMA constant
      is used.

  Returns:
    A float32 NumPy array with one entry per reward (empty if `rewards`
    is empty).
  """
  if gamma is None:
    gamma = GAMMA
  T = len(rewards)
  returns = np.empty(T, dtype=np.float32)
  future_return = 0.0
  # Walk backwards so each step reuses the return already computed for t+1.
  for t in reversed(range(T)):
    future_return = rewards[t] + gamma * future_return
    returns[t] = future_return
  return returns

def _reset_env(env):
  """Reset `env` and return only the initial observation."""
  result = env.reset()
  # gym >= 0.26 returns (observation, info); older releases return the
  # observation alone. NOTE(review): assumes the observation itself is not
  # a tuple, which holds for the array observations this script indexes by shape.
  if isinstance(result, tuple):
    return result[0]
  return result


def _step_env(env, action):
  """Step `env` and return `(state, reward, done)` for either gym step API."""
  result = env.step(action)
  if len(result) == 5:
    # Newer API: (obs, reward, terminated, truncated, info).
    state, reward, terminated, truncated, _ = result
    return state, reward, terminated or truncated
  # Older API: (obs, reward, done, info).
  state, reward, done, _ = result
  return state, reward, done


def main():
  """Train a Policy on ENV_NAME for EPISODES episodes, one update per episode.

  Each episode runs for at most TIME_UNITS steps, collecting rewards and
  action log-probabilities on the policy object, then performs a single
  update via train_per_episode and prints the loss, the total reward and
  whether the total reward exceeded TERMINATION.
  """
  env = gym.make(ENV_NAME)
  try:
    in_dim = env.observation_space.shape[0]
    out_dim = env.action_space.n
    policy = Policy(in_dim,out_dim)
    optimizer = optim.Adam(policy.parameters(), lr= LEARNING_RATE)

    for epi in range(EPISODES):
      state = _reset_env(env)
      # Start each episode with empty buffers.
      policy.log_probablity_actions = []
      policy.rewards = []
      for t in range(TIME_UNITS):
        action,log_probablity_action = policy.action_based_on_state(state)
        state,reward,done = _step_env(env, action)
        policy.log_probablity_actions.append(log_probablity_action)
        policy.rewards.append(reward)
        #env.render()
        if done:
          break
      loss = train_per_episode(policy.rewards,policy.log_probablity_actions, optimizer)
      total_reward = sum(policy.rewards)
      solved = total_reward> TERMINATION
      print(f'Episode: {epi}, loss: {loss}, total reward:{total_reward}, solved: {solved}')
  finally:
    # Release the environment's resources even if training raises.
    env.close()
    
    
      


# Entry point: run the training loop when this file is executed directly.
if __name__ == '__main__':
  main()


Episode: 0, loss: 231.34759521484375, total reward:27.0, solved: False
Episode: 1, loss: 75.57781982421875, total reward:14.0, solved: False
Episode: 2, loss: 825.8405151367188, total reward:54.0, solved: False
Episode: 3, loss: 114.6308822631836, total reward:18.0, solved: False
Episode: 4, loss: 696.83984375, total reward:50.0, solved: False
Episode: 5, loss: 108.98060607910156, total reward:17.0, solved: False
Episode: 6, loss: 1001.2152099609375, total reward:62.0, solved: False
Episode: 7, loss: 959.98583984375, total reward:59.0, solved: False
Episode: 8, loss: 182.1405029296875, total reward:24.0, solved: False
Episode: 9, loss: 1867.174560546875, total reward:89.0, solved: False
Episode: 10, loss: 598.1310424804688, total reward:46.0, solved: False
Episode: 11, loss: 124.82347106933594, total reward:18.0, solved: False
Episode: 12, loss: 408.1468505859375, total reward:39.0, solved: False
Episode: 13, loss: 1013.0557250976562, total reward:61.0, solved: False
Episode: 14, loss: