# GYM

**With the help of the given CartPole_REINFORCE_PyTorch.ipynb code,
train your RL model to achieve the max reward of 500.**

**Verify that the model can achieve a reward of 500.
You can do this by loading the trained model, running 100
episodes with it, and computing the average reward per episode over
those 100 episodes. Print the reward obtained at every 10th episode.**

In [None]:
import gym
import torch
import torch.nn as nn
import numpy as np

from google.colab import drive
# Attach Google Drive to this runtime; the trained policy is loaded from
# /content/drive/MyDrive further down in this notebook.
drive.mount('/content/drive')

# Show every registered environment id (useful to confirm CartPole-v1 exists).
registered_env_ids = gym.envs.registry.keys()
print(registered_env_ids)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
dict_keys(['CartPole-v0', 'CartPole-v1', 'MountainCar-v0', 'MountainCarContinuous-v0', 'Pendulum-v1', 'Acrobot-v1', 'LunarLander-v2', 'LunarLanderContinuous-v2', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'CarRacing-v2', 'Blackjack-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'CliffWalking-v0', 'Taxi-v3', 'Reacher-v2', 'Reacher-v4', 'Pusher-v2', 'Pusher-v4', 'InvertedPendulum-v2', 'InvertedPendulum-v4', 'InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v4', 'HalfCheetah-v2', 'HalfCheetah-v3', 'HalfCheetah-v4', 'Hopper-v2', 'Hopper-v3', 'Hopper-v4', 'Swimmer-v2', 'Swimmer-v3', 'Swimmer-v4', 'Walker2d-v2', 'Walker2d-v3', 'Walker2d-v4', 'Ant-v2', 'Ant-v3', 'Ant-v4', 'Humanoid-v2', 'Humanoid-v3', 'Humanoid-v4', 'HumanoidStandup-v2', 'HumanoidStandup-v4'])


## Setup Gym Env

In [None]:
'''
Depending on the env:
        - None (default): no render is computed.
        - human: render return None.
          The environment is continuously rendered in the current display or terminal. Usually for human consumption.
        - rgb_array: return a single frame representing the current state of the environment.
          A frame is a numpy.ndarray with shape (x, y, 3) representing RGB values for an x-by-y pixel image.
        - rgb_array_list: return a list of frames representing the states of the environment since the last reset.
          Each frame is a numpy.ndarray with shape (x, y, 3), as with `rgb_array`.
        - ansi: Return a strings (str) or StringIO.StringIO containing a
          terminal-style text representation for each time step.
          The text can include newlines and ANSI escape sequences (e.g. for colors).
'''
# Create the CartPole-v1 environment shared by the cells below.
#
# new_step_api=True: env.step() is unpacked later in this notebook as
#   (state, reward, terminated, truncated, info).
# render_mode="human": per the render-mode notes above, the environment is
#   rendered continuously in the current display or terminal.
# Modes listed by the original author for this env:
#   ['human', 'rgb_array', 'single_rgb_array']
# NOTE(review): the original comment claimed 'human' means "video is off, only
#   verbose output", which contradicts the notes above — confirm the actual
#   behaviour on a headless Colab runtime.
env = gym.make("CartPole-v1", new_step_api=True, render_mode="human")

In [None]:
#
# A model can be defined in PyTorch by subclassing the torch.nn.Module class.
# This is the PyTorch base class meant to encapsulate behaviors specific to PyTorch Models and their components.
#
# The model is defined in two steps. We first specify the layer definition of the model,
# and then outline how they are applied to the inputs. Here’s a simple model with
# two linear layers and an activation function:
#
# class TinyModel(nn.Module):
#    def __init__(self):
#        super(TinyModel, self).__init__()
#        self.linear1 = nn.Linear(D_in, H1) #Dim of input = D_in
#        self.activation = nn.ReLU() # Dim of hidden layer = H1 (num of nodes)
#        self.linear2 = nn.Linear(H1, D_out) #Dim of output = D_out
#        self.softmax = nn.Softmax(dim=1)
#
#    def forward(self, x):
#        x = self.linear1(x)
#        x = self.activation(x)
#        x = self.linear2(x)
#        x = self.softmax(x)
#        return x
#
# tinymodel = TinyModel()
#
# You may have noticed that we define the SoftMax activation for the final layer in this model.
# This is because the CrossEntropyLoss function is not used here (remember we said that it has already
# combined both a SoftMax activation and the cross entropy loss function inside).
#
# For policy gradient (PG), the objective function J(theta) is the sum of
# (log_prob * reward) over every step of a trajectory; this is the quantity we are attempting to maximize here.
#
class PolicyModel(nn.Module):
    """Small feed-forward policy network producing action probabilities.

    Architecture: Linear(obs_dim, 20) -> ReLU -> Linear(20, 30) -> ReLU
    -> Linear(30, n_actions) -> Softmax over dim 1.

    Args:
        obs_dim: Size of one observation vector. When omitted, it is read
            from the module-level ``env`` (``env.observation_space.shape[0]``).
        n_actions: Number of discrete actions. When omitted, it is read from
            the module-level ``env`` (``env.action_space.n``).

    Attributes:
        saved_log_probs: List intended to hold ln(prob) of each action
            sampled during a trajectory.
        rewards: List intended to hold the rewards obtained during a
            trajectory.

    Note:
        The layer attribute names (``linear1``, ``linear2``, ``linear3``,
        ``activation``, ``softmax``) are kept stable so that previously saved
        checkpoints of this class remain loadable.
    """

    def __init__(self, obs_dim=None, n_actions=None):
        super().__init__()

        # Only touch the global environment for sizes the caller did not
        # supply, so the network can also be built without a live env.
        if obs_dim is None:
            obs_dim = env.observation_space.shape[0]
        if n_actions is None:
            n_actions = env.action_space.n

        self.linear1 = nn.Linear(obs_dim, 20)    # input -> 20 hidden nodes
        self.linear2 = nn.Linear(20, 30)         # 20 -> 30 hidden nodes
        self.linear3 = nn.Linear(30, n_actions)  # 30 -> one output per action
        self.activation = nn.ReLU()
        # Softmax over dim 1, so forward() expects a batch dimension at dim 0.
        self.softmax = nn.Softmax(dim=1)

        # Storages used during a trajectory
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return action probabilities for a batch of observations.

        Args:
            x: Tensor whose dim 1 has the observation size; a batch
                dimension is required because softmax is taken over dim 1.

        Returns:
            Tensor with one row per input row; each row sums to 1.
        """
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.activation(x)
        x = self.linear3(x)
        x = self.softmax(x)
        return x

In [None]:
# Load the trained PyTorch model (a fully pickled PolicyModel) from disk.
# NOTE(security): weights_only=False unpickles arbitrary Python objects, so
# only load checkpoint files that you created yourself or otherwise trust.
# map_location="cpu" restores the parameters on the CPU even if the checkpoint
# was saved from a GPU runtime; the evaluation loop below builds its state
# tensors with torch.tensor(...) without a device argument.
model = torch.load(
    "/content/drive/MyDrive/policyNet.pt",
    map_location="cpu",
    weights_only=False,
)

# Set the model to evaluation mode. PolicyModel defines no dropout or
# batch-norm layers, so this is a conventional safeguard; as the last
# expression of the cell it also echoes the model architecture.
model.eval()

PolicyModel(
  (linear1): Linear(in_features=4, out_features=20, bias=True)
  (linear2): Linear(in_features=20, out_features=30, bias=True)
  (linear3): Linear(in_features=30, out_features=2, bias=True)
  (activation): ReLU()
  (softmax): Softmax(dim=1)
)

In [None]:
# Evaluation settings. The final average divides by NUM_EPISODES, so the loop
# bound and the divisor can no longer drift apart when the count is changed.
NUM_EPISODES = 100   # number of evaluation episodes
REPORT_EVERY = 10    # print the episode reward every this many episodes

# Sum of the per-episode rewards across the whole evaluation run
Total_Rewards = 0

for num_episode in range(1, NUM_EPISODES + 1):
    rewards = 0          # cumulative reward of the current episode
    state = env.reset()  # start a new episode
    done = False         # becomes True once the episode ends

    # Step the environment until the episode is over
    while not done:
        # Add a batch dimension: the policy applies softmax over dim 1
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)

        # Inference only, so no gradient tracking is needed
        with torch.no_grad():
            action_probs = model(state_tensor)

        # Greedy policy: take the action with the highest probability
        action = torch.argmax(action_probs, dim=1).item()

        # Apply the selected action in the environment
        state, reward, terminated, truncated, _ = env.step(action)

        # Episode ends if either termination or truncation is true
        done = terminated or truncated
        rewards += reward

    # Fold this episode into the running total used for the average
    Total_Rewards += rewards

    # Report the reward of every REPORT_EVERY-th episode
    if num_episode % REPORT_EVERY == 0:
        print(f"Episode {num_episode}: {rewards}")

# Print average reward over all episodes
print("Average", Total_Rewards / NUM_EPISODES)

Episode 10: 500.0
Episode 20: 500.0
Episode 30: 500.0
Episode 40: 500.0
Episode 50: 500.0
Episode 60: 500.0
Episode 70: 500.0
Episode 80: 500.0
Episode 90: 472.0
Episode 100: 500.0
Average 494.54
