In [2]:
import gym
import numpy as np
import matplotlib.pyplot as plt
#import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import statistics
from collections import Counter

# Shared environment instance; GenerateExamples steps this object directly.
env = gym.make("CartPole-v0")
# Initial reset; GenerateExamples resets again at the start of every episode.
env.reset()
goal_steps = 500          # upper bound on steps per episode (inner loop of GenerateExamples)
score_requirement = 60    # score threshold for the "save episode if score >= threshold" step in GenerateExamples
initial_games = 10000     # number of episodes played per GenerateExamples call

# Execute Episodes (multiple)

In [None]:
def GenerateExamples(model = None):
    """Play episodes on the module-level ``env`` and keep examples from the good ones.

    Runs ``initial_games`` episodes of at most ``goal_steps`` steps each. Every
    step of an episode is recorded as one row

        [obs_0, obs_1, obs_2, obs_3, action, return_to_go]

    where the observation is the state the action was taken in and
    ``return_to_go`` is the reward collected from that step until the end of
    the episode (final score minus the score accumulated before the step).
    Only episodes whose final score is at least ``score_requirement`` are kept.

    Expects the classic gym API used throughout this notebook: ``env.reset()``
    returns the observation and ``env.step()`` returns a 4-tuple.

    Parameters
    ----------
    model : callable, optional
        Policy mapping a float tensor of 4 observations to
        ``(action_log_probs, value)``. When given, the arg-max action is
        played; when ``None``, actions are sampled uniformly from
        ``env.action_space``. If the model is in training mode it is switched
        to eval mode while playing (so dropout does not randomise the policy)
        and switched back afterwards.

    Returns
    -------
    examples : np.ndarray of shape (n_rows, 6)
        Rows of all accepted episodes stacked in play order; shape ``(0, 6)``
        when no episode was accepted.
    avg_mean : float
        Mean final score of the accepted episodes (``0.0`` if none).
    avg_median : float
        Median final score of the accepted episodes (``0.0`` if none).
    """
    accepted_games = []     # one (n_steps, 6) array per accepted episode
    accepted_scores = []    # final scores of the accepted episodes

    # Dropout must be off while the policy is being evaluated.
    restore_training = getattr(model, "training", False)
    if restore_training:
        model.eval()
    try:
        for _ in range(initial_games):
            game, score = _play_episode(model)
            # Keep only episodes that reached the score threshold.
            if score >= score_requirement:
                accepted_games.append(game)
                accepted_scores.append(score)
    finally:
        if restore_training:
            model.train()

    if accepted_scores:
        allExamples = np.vstack(accepted_games)
        avg_mean = statistics.mean(accepted_scores)
        avg_median = statistics.median(accepted_scores)
    else:
        # statistics.mean/median raise on empty input; report a neutral score.
        allExamples = np.empty((0, 6))
        avg_mean, avg_median = 0.0, 0.0

    print('Average accepted score: ', avg_mean)
    print('Median score for accepted scores: ', avg_median)
    print(Counter(accepted_scores))

    return allExamples, avg_mean, avg_median


def _play_episode(model = None):
    """Play one episode and return ``(rows, score)`` with rows of shape (n_steps, 6)."""
    # Start from the real initial state returned by the environment.
    observation = np.asarray(env.reset(), dtype=np.float64)
    score = 0.0
    rows = []

    for _ in range(goal_steps):
        if model is None:
            action = env.action_space.sample()   # random action (0-left or 1-right)
        else:
            with torch.no_grad():                # inference only, no autograd graph
                x = torch.tensor(observation, dtype = torch.float)
                action_prob, _value = model(x)
            action = int(torch.argmax(action_prob))

        next_observation, reward, done, _info = env.step(action)

        # Store the state the action was taken in, the action, and the score so far.
        rows.append(np.append(observation[0:4], [action, score]))

        observation = np.asarray(next_observation, dtype=np.float64)
        score += reward    # +1 for every frame we haven't fallen
        if done:
            break

    game = np.array(rows, dtype=np.float64).reshape(-1, 6)
    # Convert "score before this step" into "reward still to come from this step".
    game[:, 5] = score - game[:, 5]
    return game, score

In [None]:
# No model is passed, so GenerateExamples plays with randomly sampled actions.
training_data, mean, median = GenerateExamples()

# Create Policy (Neural Net model)

In [None]:
# Mini-batch size used when slicing the training examples
batch_size = 64

class Net(nn.Module):
    """Fully connected network with a shared trunk and two output heads.

    The trunk maps a 4-value observation through layers of width
    128 -> 256 -> 128 -> 32. One head produces log-probabilities over the two
    actions, the other a single unbounded value (the expected return).
    """

    def __init__(self):
        super().__init__()

        # Shared trunk
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, 256)
        self.l3 = nn.Linear(256, 128)
        self.l4 = nn.Linear(128, 32)

        # One dropout module reused after every layer (rate 0.3, the value Suragnair used)
        self.dp = nn.Dropout(p = 0.3)

        # Output heads
        self.fc1 = nn.Linear(32, 2)    # action head: entry i belongs to action i
        self.fc2 = nn.Linear(32, 1)    # value head: expected return

    def forward(self, obs):
        """Return ``(action_log_probs, value)`` for the observation(s) in ``obs``."""
        hidden = obs
        # Each trunk layer is followed by dropout, then ReLU.
        for layer in (self.l1, self.l2, self.l3, self.l4):
            hidden = F.relu(self.dp(layer(hidden)))

        # Each head sees its own freshly dropped-out copy of the trunk output.
        logits = self.fc1(self.dp(hidden))
        value = self.fc2(self.dp(hidden))  # linear output, no squashing

        # Normalise over the last dimension so exp(log-probs) sum to 1 per observation.
        return F.log_softmax(logits, dim = -1), value

In [None]:
# Module-level network instance; train_model reads and optimises this global.
model = Net() # initialise the neural net

# Train Model (Policy Evaluation)

In [1]:
def train_model(epoch, examples):
    """Run one pass of mini-batch training of the module-level ``model``.

    Parameters
    ----------
    epoch : int
        Epoch number; only used in the progress print-out.
    examples : array-like
        Either a 2-D numeric array with 6 columns
        ``[obs_0, obs_1, obs_2, obs_3, action, return]`` or a sequence of
        ``[observation(4), action, return]`` triples.

    Returns
    -------
    model : the module-level network, updated in place
    action_loss : list of float
        Policy-head loss of every batch.
    value_loss : list of float
        Value-head loss of every batch.
    accuracy : list of float
        Fraction of each batch where the arg-max action equals the target.

    The three lists are empty when ``examples`` is empty.
    """
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=0.005)
    action_loss, value_loss, accuracy = [], [], []

    # Nothing to learn from: leave the model untouched.
    if len(examples) == 0:
        return model, action_loss, value_loss, accuracy

    # ------------- CONVERT TO CORRECT DATA TYPE ----------------
    device = torch.device("cpu")
    states, actions, returns = _split_examples(examples)
    allStates = torch.tensor(states, dtype = torch.float, device = device)    # (N, 4)
    allActions = torch.tensor(actions, dtype = torch.long, device = device)   # (N,)
    allReturns = torch.tensor(returns, dtype = torch.float, device = device)  # (N,)

    # Shuffle so a batch is not made of consecutive steps of a single episode.
    n_samples = allStates.size()[0]
    permutation = torch.randperm(n_samples)
    allStates = allStates[permutation]
    allActions = allActions[permutation]
    allReturns = allReturns[permutation]

    for batch_idx, index in enumerate(range(0, n_samples, batch_size), start=1):

        # -------- GET BATCHES -----------
        stop = min(index + batch_size, n_samples)   # the last batch may be short
        batch_states = allStates[index : stop]
        batch_actions = allActions[index : stop]
        batch_returns = allReturns[index : stop]

        # --------- TRAIN & BACKPROP ----------
        optimizer.zero_grad()
        pred_actions, state_value = model(batch_states)   # (B, 2) log-probs and (B, 1) values

        a_loss = F.nll_loss(pred_actions, batch_actions)
        # Same quantity as binary_cross_entropy(sigmoid(value), target), computed
        # from the logits for numerical stability.
        # NOTE(review): sigmoid(return) is ~1.0 for any return above a handful of
        # steps, so this target carries little information; consider regressing
        # on a normalised return instead.
        v_loss = F.binary_cross_entropy_with_logits(state_value[:, 0], torch.sigmoid(batch_returns))
        tot_loss = a_loss + v_loss

        tot_loss.backward()
        optimizer.step()

        # Store plain floats so the autograd graph of every batch is not kept alive.
        action_loss.append(a_loss.item())
        value_loss.append(v_loss.item())

        # --------- PRINT STATS --------------
        # Fraction of correct arg-max predictions, relative to the actual batch length.
        predicted = torch.argmax(pred_actions, dim = 1)
        batch_accuracy = (predicted == batch_actions).float().mean().item()
        accuracy.append(batch_accuracy)

        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} \tAccuracy: {:.5f}'.format(
                epoch,
                stop,
                n_samples,
                100 * stop / n_samples,
                tot_loss.item(),
                batch_accuracy)
             )

    return model, action_loss, value_loss, accuracy


def _split_examples(examples):
    """Split examples into ``(states, actions, returns)`` numpy arrays."""
    try:
        table = np.asarray(examples, dtype=np.float64)
    except (ValueError, TypeError):
        table = None    # ragged input: rows are [observation, action, return]

    if table is not None and table.ndim == 2 and table.shape[1] == 6:
        return table[:, 0:4], table[:, 4], table[:, 5]

    states = np.array([row[0] for row in examples], dtype=np.float64)
    actions = np.array([row[1] for row in examples], dtype=np.float64)
    returns = np.array([row[2] for row in examples], dtype=np.float64)
    return states, actions, returns

In [None]:
# Same call as the earlier data-generation cell: plays a new set of random-action episodes.
training_data, mean, median = GenerateExamples()

In [None]:
# One training pass (labelled epoch 1); train_model returns the module-level `model` plus per-batch stats.
new_net, a_loss, v_loss, batch_acc = train_model(1, training_data)

# Policy Iteration

In [None]:
def policyIteration():
    """Alternate data generation and training, keeping the best policy seen.

    Each of the 10 rounds:
      1. plays episodes with the current best policy to get training data and
         its baseline mean/median score,
      2. trains a candidate with ``train_model``,
      3. plays episodes with the candidate and adopts it only if both its mean
         and median scores beat the baseline.

    Returns
    -------
    The best network found (the freshly initialised one if no candidate won).
    """
    import copy

    nnet = Net()

    for i in range(10):
        training_data, mean, median = GenerateExamples(nnet)
        if len(training_data) == 0:
            # The current policy never produced usable examples; fall back to
            # random play for training data while keeping the policy's scores
            # as the baseline to beat.
            training_data, _, _ = GenerateExamples()
        if len(training_data) == 0:
            continue    # still nothing to train on this round

        candidate, _, _, _ = train_model(i + 1, training_data)
        _, new_mean, new_median = GenerateExamples(candidate)

        if new_mean > mean and new_median > median:
            # train_model hands back the same network object on every call, so
            # keep a snapshot; otherwise a later, rejected training round would
            # silently overwrite the accepted policy.
            nnet = copy.deepcopy(candidate)
            print("policy updated!")

    return nnet

In [None]:
# Rebinds the module-level `model` to the network returned by policyIteration.
model = policyIteration()