<a href="https://colab.research.google.com/github/bkgsur/foundationsofdrl/blob/main/REINFORCE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from torch.distributions import Categorical
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# Hyperparameters and run settings shared by the cells below.
GAMMA = 0.99          # discount factor applied to future rewards when computing returns
LEARNING_RATE = 0.01  # learning rate handed to the Adam optimizer
HIDDEN_UNITS = 64     # width of the policy network's single hidden layer
EPISODES = 300        # number of training episodes to run
TIME_UNITS = 200      # maximum number of environment steps per episode
TERMINATION = 195     # an episode counts as "solved" when its total reward exceeds this

In [None]:
class Policy(nn.Module):
  """Stochastic policy pi(a|s) over discrete actions, backed by a small MLP.

  The network maps a state vector to one logit per action. The instance also
  stores the per-step log-probabilities and rewards of the current episode
  (``log_probablity_actions`` and ``rewards``); callers append to these lists
  and clear them with ``onpolicy_reset``.
  """

  def __init__(self, in_dim, out_dim, hidden_units=None):
    """Build the network.

    Args:
      in_dim: size of the state vector.
      out_dim: number of discrete actions (one logit each).
      hidden_units: width of the hidden layer; defaults to the module-level
        ``HIDDEN_UNITS`` when omitted.
    """
    super(Policy, self).__init__()
    if hidden_units is None:
      hidden_units = HIDDEN_UNITS
    layers = [
        nn.Linear(in_dim, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, out_dim),
    ]
    self.policynetwork = nn.Sequential(*layers)
    self.onpolicy_reset()
    self.train()  # put the module in training mode

  def onpolicy_reset(self):
    """Discard the stored log-probabilities and rewards of the current episode."""
    self.log_probablity_actions = []
    self.rewards = []

  def forward(self, state):
    """Return the action logits for a float tensor ``state``."""
    return self.policynetwork(state)

  def action_based_on_state(self, state):
    """Sample an action for ``state`` from the current policy.

    Args:
      state: NumPy array (or array-like) holding the observation; it is
        converted to float32 before being fed to the network.

    Returns:
      Tuple ``(action, log_prob)``: the sampled action as a Python int and
      the log-probability of that action as a tensor attached to the graph.
    """
    state_tensor = torch.from_numpy(np.asarray(state, dtype=np.float32))
    # Policy distribution pi(a|s) built from the network's logits.
    policydistribution = Categorical(logits=self(state_tensor))
    action = policydistribution.sample()  # sample an action from the policy
    return action.item(), policydistribution.log_prob(action)

  # Short alias so callers may use either name.
  act = action_based_on_state

def train_per_episode(policy, optimizer, gamma=None):
  """Run one REINFORCE gradient step using the episode stored on ``policy``.

  Computes the discounted return for every step of the episode, forms the
  loss ``sum(-log_prob_t * return_t)`` and applies one optimizer step.

  Args:
    policy: object exposing ``rewards`` (list of per-step rewards) and
      ``log_probablity_actions`` (list of per-step log-prob tensors) of equal
      length.
    optimizer: torch optimizer over the policy's parameters.
    gamma: discount factor; defaults to the module-level ``GAMMA``.

  Returns:
    The scalar loss tensor for the episode.
  """
  if gamma is None:
    gamma = GAMMA
  # Use the actual episode length: an episode may end before TIME_UNITS steps.
  num_steps = len(policy.rewards)
  returns = np.empty(num_steps, dtype=np.float32)
  future_return = 0.0
  # Walk backwards so each return reuses the already-discounted tail.
  for t in reversed(range(num_steps)):
    future_return = policy.rewards[t] + gamma * future_return
    returns[t] = future_return
  returns = torch.tensor(returns)
  log_probablity_actions = torch.stack(policy.log_probablity_actions)
  # Negated because the optimizer minimises while we want to maximise return.
  loss = torch.sum(-log_probablity_actions * returns)
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()
  return loss


def main():
  """Train a policy on CartPole-v0 with REINFORCE, printing per-episode stats.

  Uses the gym API in which ``env.reset()`` returns the observation and
  ``env.step()`` returns a 4-tuple ``(state, reward, done, info)``.
  """
  env = gym.make("CartPole-v0")
  in_dim = env.observation_space.shape[0]
  out_dim = env.action_space.n
  policy = Policy(in_dim, out_dim)
  optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE)

  for epi in range(EPISODES):
    state = env.reset()
    # Collect one episode of experience (at most TIME_UNITS steps).
    for _step in range(TIME_UNITS):
      action, log_probablity_action = policy.action_based_on_state(state)
      state, reward, done, _ = env.step(action)
      policy.log_probablity_actions.append(log_probablity_action)
      policy.rewards.append(reward)
      env.render()
      if done:
        break
    # Train once per episode, after the whole trajectory has been gathered.
    loss = train_per_episode(policy, optimizer)
    total_reward = sum(policy.rewards)
    solved = total_reward > TERMINATION
    policy.onpolicy_reset()  # on-policy: drop the trajectory after the update
    print(f'Episode: {epi}, loss: {loss}, total reward:{total_reward}, solved: {solved}')


if __name__ == '__main__':
  main()













