In [None]:
# importing packages
import os
import random
import sys
from collections import namedtuple, deque

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from plyer import notification
from torch.autograd import Variable

In [None]:
# Discount factor applied to future rewards when returns are computed.
GAMMA = 0.99

# NOTE(review): this module-level learning rate is 0 and none of the functions
# visible in this notebook read it (they all take their own `learning_rate`
# argument) -- confirm whether it is still needed.
learning_rate = 0

# Use the first CUDA GPU when PyTorch reports one, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

class PolicyNetwork(nn.Module):
    """Two-headed network: a shared hidden layer feeding a policy head and a value head.

    ``linear1`` is followed by a ReLU whose output is shared by ``linear2``
    (one score per action) and ``linear3`` (a single scalar per state). The
    module also owns an Adam optimizer over all of its parameters.

    Args:
        num_inputs: Number of input features of ``linear1``.
        num_actions: Number of discrete actions (output width of ``linear2``).
        hidden_size: Width of the shared hidden layer.
        learning_rate: Learning rate handed to ``optim.Adam``.
    """

    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super().__init__()
        self.num_actions = num_actions
        # Layers are created in the same order as before so that a fixed torch
        # seed yields the same initial weights.
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)
        self.linear3 = nn.Linear(hidden_size, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, state):
        """Return ``(action_probabilities, state_values)`` for ``state``.

        The probabilities are a softmax over the last axis of the policy head
        output, which for the usual ``(batch, num_inputs)`` input is identical
        to the previous ``dim=1``; unbatched 1-D input now works as well.
        """
        hidden = F.relu(self.linear1(state))
        action_scores = self.linear2(hidden)
        state_values = self.linear3(hidden)
        return F.softmax(action_scores, dim=-1), state_values

    def get_action(self, state):
        """Sample an action for a single observation.

        Args:
            state: One observation; a NumPy array or anything else
                ``torch.as_tensor`` accepts (e.g. a list of floats).

        Returns:
            A tuple ``(action, log_prob, state_value)`` where ``action`` is the
            index drawn with ``np.random.choice`` from the policy
            distribution, ``log_prob`` is the (differentiable) log-probability
            of that action, and ``state_value`` is the value-head output with
            a leading batch dimension of 1.
        """
        # Build the input on whatever device the weights live on, so the
        # network keeps working after ``.to(device)`` moved it to a GPU.
        model_device = self.linear1.weight.device
        state = torch.as_tensor(state, dtype=torch.float32, device=model_device).unsqueeze(0)
        probs, state_value = self.forward(state)
        # ``.numpy()`` is only available for CPU tensors, hence the ``.cpu()``.
        action_probs = probs.detach().squeeze(0).cpu().numpy()
        sampled_action = np.random.choice(self.num_actions, p=action_probs)
        log_prob = torch.log(probs.squeeze(0)[sampled_action])
        return sampled_action, log_prob, state_value

In [None]:
def compute_returns(rewards, gamma=None):
    """Compute the discounted return-to-go for every step of an episode.

    Entry ``t`` of the result is ``sum(gamma**k * rewards[t + k])`` over all
    remaining steps ``k >= 0``. The values are accumulated backwards in a
    single pass (``G_t = r_t + gamma * G_{t+1}``), which is linear in the
    episode length instead of quadratic; results may differ from the explicit
    power sum only by floating-point rounding.

    Args:
        rewards: Sequence of per-step rewards in time order.
        gamma: Discount factor. ``None`` (the default) uses the module-level
            ``GAMMA`` constant.

    Returns:
        A list with one return per reward; ``[]`` for an empty episode.
    """
    if gamma is None:
        gamma = GAMMA
    discounted_rewards = [0] * len(rewards)
    running_return = 0
    for t in range(len(rewards) - 1, -1, -1):
        running_return = rewards[t] + gamma * running_return
        discounted_rewards[t] = running_return
    return discounted_rewards

def update_policy(policy_network, rewards, log_probs, state_values):
    """Run one optimizer step on the data of a single finished episode.

    The loss is the sum of two parts:

    * a policy term ``-log_prob * (G_t - V(s_t))`` per step, where ``G_t`` comes
      from ``compute_returns`` and the value enters through ``.item()`` (a plain
      number), so this term does not back-propagate into the value output;
    * a smooth-L1 term between ``rewards[i] + GAMMA * state_values[i + 1]`` and
      ``state_values[i]`` for every pair of consecutive steps.

    Args:
        policy_network: Object exposing an ``optimizer`` attribute with
            ``zero_grad()`` and ``step()``.
        rewards: Per-step rewards of the episode.
        log_probs: Per-step log-probability tensors of the chosen actions.
        state_values: Per-step value tensors.

    Returns:
        None. Nothing is done when there are no steps to learn from.
    """
    discounted_rewards = torch.tensor(compute_returns(rewards))

    policy_terms = []
    for log_prob, value, Gt in zip(log_probs, state_values, discounted_rewards):
        advantage = Gt - value.item()
        policy_terms.append(-log_prob * advantage)
    if not policy_terms:
        # Empty episode: torch.stack([]) would raise, and there is nothing to learn.
        return

    # NOTE(review): neither side of this loss is detached, so if the value
    # tensors carry gradients they flow through both V(s_i) and the
    # bootstrapped V(s_{i+1}); the last step of the episode also gets no
    # term of its own. Confirm both are intended before changing them.
    value_terms = [
        F.smooth_l1_loss(rewards[i] + GAMMA * state_values[i + 1], state_values[i])
        for i in range(len(state_values) - 1)
    ]

    loss = torch.stack(policy_terms).sum()
    if value_terms:
        # A one-step episode has no consecutive pairs; skip the value part
        # instead of crashing on torch.stack([]).
        loss = loss + torch.stack(value_terms).sum()

    policy_network.optimizer.zero_grad()
    loss.backward()
    policy_network.optimizer.step()

In [None]:
def train(environment, render=False, hidden_size=128, learning_rate=3e-4, max_episode_num=750, max_steps=1000):
    """Train a ``PolicyNetwork`` on one Gym environment and return episode rewards.

    The environment is used through the classic Gym API visible here:
    ``env.seed(...)``, ``env.reset()`` returning the observation and
    ``env.step(action)`` returning a 4-tuple.

    An episode ends when the environment reports ``done`` or after
    ``max_steps`` steps, whichever comes first; in both cases its total reward
    is recorded. The policy is updated after every episode until the running
    average over the last 100 episodes first reaches
    ``environment["avg_reward_threshold"]``; from then on the policy is frozen
    and the remaining episodes are only evaluated.

    Args:
        environment: Dict with keys ``"name"`` (passed to ``gym.make``) and
            ``"avg_reward_threshold"``.
        render: Call ``env.render()`` before every step when true.
        hidden_size: Hidden-layer width of the policy network.
        learning_rate: Learning rate of the network's optimizer.
        max_episode_num: Number of episodes to run.
        max_steps: Maximum number of steps per episode.

    Returns:
        List with the total reward of every episode, in order.
    """
    env = gym.make(environment["name"])
    try:
        env.seed(random.randint(0, 100))
        policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, hidden_size, learning_rate).to(device)
        reward_sat = False
        all_rewards = []
        scores_window = deque(maxlen=100)
        for episode in range(max_episode_num):
            state = env.reset()
            log_probs = []
            state_values = []
            rewards = []
            episode_reward = 0
            for _step in range(max_steps):
                if render:
                    env.render()
                action, log_prob, state_value = policy_net.get_action(state)
                new_state, reward, done, _ = env.step(action)
                episode_reward += reward
                rewards.append(reward)
                log_probs.append(log_prob)
                state_values.append(state_value)
                if done:
                    break
                state = new_state

            # Record the episode whether it terminated or ran into max_steps;
            # previously a truncated episode was dropped silently, which could
            # leave scores_window empty and make the average below nan.
            scores_window.append(episode_reward)
            all_rewards.append(episode_reward)
            # Stop learning once the threshold was reached. The former
            # `plot_post_sat -= 1` branch referenced a variable that was never
            # initialised and raised UnboundLocalError here.
            if not reward_sat and rewards:
                update_policy(policy_net, rewards, log_probs, state_values)

            average_reward = np.mean(scores_window)
            if episode % 100 == 0:
                print('\rEpisode {}\tTotal Reward: {:.2f}\tAverage Reward: {:.2f}'.format(episode, episode_reward, average_reward))
            if not reward_sat and average_reward >= environment["avg_reward_threshold"]:
                print('\nEnvironment solved in {:d} episodes!\tAverage Reward: {:.2f}'.format(episode, average_reward))
                reward_sat = True
    finally:
        # Release the environment even when training fails part-way.
        env.close()
    return all_rewards

In [None]:
def run_agent(environment, learning_rates, num_runs=5):
    """Train repeatedly for every learning rate and collect the reward curves.

    Each run calls ``train(environment, False, 128, lr)``, i.e. rendering off
    and a hidden size of 128.

    Args:
        environment: Environment description dict forwarded to ``train``.
        learning_rates: Iterable of learning rates to evaluate.
        num_runs: Number of independent training runs per learning rate
            (defaults to the previously hard-coded 5).

    Returns:
        A list with one entry per learning rate; each entry is a list of
        ``num_runs`` reward lists as returned by ``train``.
    """
    all_scores = []
    for lr in learning_rates:
        runs = []
        for run_index in range(num_runs):
            print("Run ", run_index, ": ", lr)
            runs.append(train(environment, False, 128, lr))
        all_scores.append(runs)
    try:
        notification.notify(title="Run Complete",
                            message="Your rewards have been plotted")
    except Exception as exc:
        # The desktop notification is only a convenience; a failure here
        # must not throw away the training results collected above.
        print("Desktop notification failed:", exc)
    return all_scores

In [None]:
# environment = {"name": 'Acrobot-v1', "avg_reward_threshold": -100}
# # learning_rates = np.linspace(2e-3, 9e-3, 15)
# learning_rates = [0.003]
# all_scores_1 = run_agent(environment, learning_rates)

In [None]:
# CartPole-v1 experiment: training counts as solved once the running average
# reward reaches 475.
environment = dict(name='CartPole-v1', avg_reward_threshold=475)
# Alternative sweep over 15 learning rates:
# learning_rates = np.linspace(2e-3, 9e-3, 15)
learning_rates = [2.5e-3]
all_scores_2 = run_agent(environment, learning_rates)

In [None]:
# np.save("./plots/acrobot_without_baseline", np.array(all_scores_1))
# Create the output directory first: np.save raises FileNotFoundError when
# "./plots" is missing, which would lose the results of the whole run.
# NOTE(review): the file name says "without_baseline", but update_policy
# subtracts the value estimate from the return -- confirm the name is intended.
os.makedirs("./plots", exist_ok=True)
np.save("./plots/cartpole_without_baseline", np.array(all_scores_2))