In [13]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import pandas as pd

# hyperparameters
hidden_size = 256
learning_rate = 3e-4

# Constants
GAMMA = 0.99
num_steps = 300
max_episodes = 3000
epsilon = 0.10

In [14]:
class ActorCritic(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(ActorCritic, self).__init__()

        self.num_actions = num_actions
        self.critic_linear1 = nn.Linear(num_inputs, hidden_size)
        self.critic_linear2 = nn.Linear(hidden_size, 1)

        self.actor_linear1 = nn.Linear(num_inputs, hidden_size)
        self.actor_linear2 = nn.Linear(hidden_size, num_actions)
    
    def forward(self, state):
        state = Variable(torch.from_numpy(state).float().unsqueeze(0))
        
        #g3t the value
        value = F.relu(self.critic_linear1(state))
        value = self.critic_linear2(value)
        
        #get the new policy dist
        p_d = F.relu(self.actor_linear1(state))
        p_d = F.softmax(self.actor_linear2(p_d))
        
        return value, p_d

In [27]:
num_inputs = env.observation_space.shape[0]
num_outputs = env.action_space.n
ac = ActorCritic(num_inputs, num_outputs, hidden_size)
def train_actor_agent(env):
    
    
    
    ac_opt = optim.Adam(ac.parameters(), lr=learning_rate)
    
    all_lengths = []
    average_lengths = []
    all_rewards = []
    entropy_term = 0
    
    for episode in range(max_episodes):
        log_probs = []
        values = []
        rewards = []

        state = env.reset()
        for steps in range(num_steps):
            value, p_d = ac.forward(state)
            value = value.detach().numpy()[0,0]
            dist = p_d.detach().numpy()
            
            #epsilon greedy action choice
            exp = np.random.uniform(0, 1)
            if exp >= epsilon:
                action = np.random.choice(num_outputs, p=np.squeeze(dist))
            else:
                action = np.random.choice(num_outputs)
            
            log_prob = torch.log(p_d.squeeze(0)[action])
            entropy = -np.sum(np.mean(dist) * np.log(dist))
            new_state, reward, done, _ = env.step(action)
            
            rewards.append(reward)
            values.append(value)
            log_probs.append(log_prob)
            entropy_term += entropy
            state = new_state
            
            
            if done or steps == num_steps-1:
                Qval, _ = ac.forward(new_state)
                Qval = Qval.detach().numpy()[0,0]
                all_rewards.append(np.sum(rewards))
                all_lengths.append(steps)
                average_lengths.append(np.mean(all_lengths[-10:]))
                if episode % 10 == 0:                    
                    sys.stdout.write("episode: {}, reward: {}, total length: {}, average length: {} \n".format(episode, np.sum(rewards), steps, average_lengths[-1]))
                break
            
        Qvals = np.zeros_like(values)

        #iterate Q values
        for t in reversed(range(len(rewards))):
            Qval = rewards[t] + GAMMA * Qval
            Qvals[t] = Qval


        values = torch.FloatTensor(values)
        Qvals = torch.FloatTensor(Qvals)
        log_probs = torch.stack(log_probs)

        advantage = Qvals - values
        actor_loss = (-log_probs * advantage).mean()
        critic_loss = .5*advantage.pow(2).mean()
        ac_loss = actor_loss + critic_loss + .001*entropy_term

        ac_opt.zero_grad()
        ac_loss.backward()
        ac_opt.step()
    

env = gym.make("CartPole-v0")
train_actor_agent(env)
torch.save(ac.state_dict(), "./ac.weights")
            
            
    



episode: 0, reward: 14.0, total length: 13, average length: 13.0 
episode: 10, reward: 50.0, total length: 49, average length: 20.4 
episode: 20, reward: 17.0, total length: 16, average length: 21.5 
episode: 30, reward: 32.0, total length: 31, average length: 18.6 
episode: 40, reward: 31.0, total length: 30, average length: 18.3 
episode: 50, reward: 10.0, total length: 9, average length: 19.1 
episode: 60, reward: 39.0, total length: 38, average length: 25.8 
episode: 70, reward: 25.0, total length: 24, average length: 22.1 
episode: 80, reward: 12.0, total length: 11, average length: 23.8 
episode: 90, reward: 21.0, total length: 20, average length: 26.5 
episode: 100, reward: 9.0, total length: 8, average length: 20.2 
episode: 110, reward: 21.0, total length: 20, average length: 25.7 
episode: 120, reward: 13.0, total length: 12, average length: 23.5 
episode: 130, reward: 42.0, total length: 41, average length: 27.4 
episode: 140, reward: 13.0, total length: 12, average length: 

episode: 1200, reward: 136.0, total length: 135, average length: 132.0 
episode: 1210, reward: 200.0, total length: 199, average length: 146.7 
episode: 1220, reward: 152.0, total length: 151, average length: 139.5 
episode: 1230, reward: 110.0, total length: 109, average length: 154.2 
episode: 1240, reward: 166.0, total length: 165, average length: 154.4 
episode: 1250, reward: 200.0, total length: 199, average length: 135.7 
episode: 1260, reward: 25.0, total length: 24, average length: 143.4 
episode: 1270, reward: 145.0, total length: 144, average length: 130.5 
episode: 1280, reward: 187.0, total length: 186, average length: 108.5 
episode: 1290, reward: 33.0, total length: 32, average length: 137.9 
episode: 1300, reward: 177.0, total length: 176, average length: 147.2 
episode: 1310, reward: 54.0, total length: 53, average length: 83.3 
episode: 1320, reward: 115.0, total length: 114, average length: 83.3 
episode: 1330, reward: 103.0, total length: 102, average length: 102.0 


episode: 2350, reward: 165.0, total length: 164, average length: 174.9 
episode: 2360, reward: 161.0, total length: 160, average length: 109.1 
episode: 2370, reward: 160.0, total length: 159, average length: 147.5 
episode: 2380, reward: 121.0, total length: 120, average length: 148.3 
episode: 2390, reward: 130.0, total length: 129, average length: 170.9 
episode: 2400, reward: 172.0, total length: 171, average length: 136.1 
episode: 2410, reward: 175.0, total length: 174, average length: 168.4 
episode: 2420, reward: 168.0, total length: 167, average length: 157.7 
episode: 2430, reward: 44.0, total length: 43, average length: 159.5 
episode: 2440, reward: 39.0, total length: 38, average length: 128.5 
episode: 2450, reward: 98.0, total length: 97, average length: 129.7 
episode: 2460, reward: 46.0, total length: 45, average length: 145.8 
episode: 2470, reward: 89.0, total length: 88, average length: 170.6 
episode: 2480, reward: 31.0, total length: 30, average length: 136.6 
epis

In [35]:

rewards = []
ac = ActorCritic(num_inputs, num_outputs, hidden_size)
ac.load_state_dict(torch.load("./ac_weights"))
ac.eval()
for episode in range(max_episodes):
    state = env.reset()
    for steps in range(num_steps):
        value, p_d = ac.forward(state)
        value = value.detach().numpy()[0,0]
        dist = p_d.detach().numpy()

        #epsilon greedy action choice
        exp = np.random.uniform(0, 1)
        if exp >= epsilon:
            action = np.random.choice(num_outputs, p=np.squeeze(dist))
        else:
            action = np.random.choice(num_outputs)

        new_state, reward, done, _ = env.step(action)

        rewards.append(reward)
        state = new_state
        env.render()
        if done or steps == num_steps-1:
            if episode % 10 == 0:                    
                sys.stdout.write("episode: {}, reward: {}, total length: {}\n".format(episode, np.sum(rewards), steps))
            break
        

            



episode: 0, reward: 200.0, total length: 199
episode: 10, reward: 1880.0, total length: 47
episode: 20, reward: 3326.0, total length: 150
episode: 30, reward: 4882.0, total length: 157
episode: 40, reward: 6254.0, total length: 175
episode: 50, reward: 7959.0, total length: 171


KeyboardInterrupt: 