In [1]:
import gym
env = gym.make('CartPole-v1')
''' 
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. 
The system is controlled by applying a force of +1 or -1 to the cart. 
The pendulum starts upright, and the goal is to prevent it from falling over. 
A reward of +1 is provided for every timestep that the pole remains upright.
The episode ends when the pole is more than 15 degrees from vertical, or the 
cart moves more than 2.4 units from the center.
'''

# env.reset()    #returns an initial observation

' \nA pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. \nThe system is controlled by applying a force of +1 or -1 to the cart. \nThe pendulum starts upright, and the goal is to prevent it from falling over. \nA reward of +1 is provided for every timestep that the pole remains upright.\nThe episode ends when the pole is more than 15 degrees from vertical, or the \ncart moves more than 2.4 units from the center.\n'

# Neural Net

In [2]:
import argparse
import os
import shutil
import time
import random
import numpy as np

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

In [12]:
args = dict({
    'lr': 0.001,
    'dropout': 0.3,
    'epochs': 1,
    'batch_size': 64,
    'cuda': torch.cuda.is_available(),
    'num_channels': 512,
})

In [1]:
# Training settings
batch_size = 64

class NNet(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        
        self.l1 = nn.Linear(4, 128)
        self.l2 = nn.Linear(128, 256)
        self.l3 = nn.Linear(256, 128)
        self.l4 = nn.Linear(128, 32)
        
        self.dp = nn.Dropout(p = 0.2)
        self.fc = nn.Linear(32, 2)    # want an action vector output: [log(prob right), log(prob left)]

    def forward(self, x):
        #in_size = x.size(0)
        x = F.relu(self.dp(self.l1(x)))
        x = F.relu(self.dp(self.l2(x)))
        x = F.relu(self.dp(self.l3(x)))
        x = F.relu(self.dp(self.l4(x)))
        
        #x = x.view(in_size, -1)  # flatten the tensor
        x = self.fc(self.dp(x))
        x = F.log_softmax(x, dim = -1)    # choose the dimension such that we get something like 
        return x                         # [exp(-0.6723) +  exp(-0.7144)] = 1 for the output   

    def train(self, examples):
        """
        examples: list of examples, each example is of form (state, pi, v)
        """
        optimizer = torch.optim.Adam(self.model.parameters())

        for epoch in range(args.epochs):
            print('EPOCH ::: ' + str(epoch+1))
            self.model.train()     # set module in training mode
            batch_idx = 0
            
            while batch_idx < int(len(examples)/args.batch_size):
                
                # --------------------- FORMAT DATA ----------------------
                
                # generate an array (of length batch_size) of random integers between 0 and len(examples)
                sample_ids = np.random.randint(len(examples), size=args.batch_size)
                
                # randomly rearrange selected examples and then transpose them
                states, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
                print("Shapes: ", states.shape, pis.shape, vs.shape)
                
                #convert states, policies and state-values to a tensors of floats
                states = torch.FloatTensor(np.array(states).astype(np.float64))
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                # -------------------- PREDICT --------------------------
                if args.cuda:  #if we're using the GPU:
                    states, target_pis, target_vs = states.contiguous().cuda(), target_pis.contiguous().cuda(), target_vs.contiguous().cuda()
                states, target_pis, target_vs = Variable(states), Variable(target_pis), Variable(target_vs)
                

                # -------------------- FEED FORWARD ----------------------        
                out_pi, out_v = self.model(states)
            
                l_pi = self.loss_pi(target_pis, out_pi)
                l_v = self.loss_v(target_vs, out_v)
                total_loss = l_pi + l_v

                # record loss
                print("Pi loss: ", l_pi.data[0], states.size(0))
                print("V loss: ", l_v.data[0], states.size(0))

                # ----------- COMPUTE GRADS AND BACKPROP ----------------
                optimizer.zero_grad()
                total_loss.backward()
                optimizer.step()

                batch_idx += 1

    def loss_pi(self, targets, outputs):
        return -torch.sum(targets*outputs)/targets.size()[0]

    def loss_v(self, targets, outputs):
        return torch.sum((targets-outputs.view(-1))**2)/targets.size()[0]
            
            
            
    def predict(self, state):
        """
        state: np array with state
        """
        # timing
        start = time.time()

        # preparing input
        state = torch.FloatTensor(state.astype(np.float64))
        if args.cuda: 
            state = state.contiguous().cuda()
            
        #A Variable wraps a Tensor. Variable also provides a backward method to perform backpropagation
        state = torch.autograd.Variable(state, volatile=True)
        state = state.view(1, self.state_x, self.state_y)

        # sets mode to prediction
        self.model.eval()
        pi, v = self.model(state)

        #print('PREDICTION TIME TAKEN : {0:03f}'.format(time.time()-start))
        return torch.exp(pi).data.cpu().numpy()[0], v.data.cpu().numpy()[0]

model = NNet()   

# Monte-Carlo Tree Search

In [17]:
class MCTS():

    def __init__(self, nnet):
        self.nnet = nnet    #fuction handle
        self.c_puct = 0.1
        self.Qsa = {}       # stores Q values for s,a (as defined in the paper)
        self.Nsa = {}       # stores #times edge s,a was visited
        self.Ns = {}        # stores #times board s was visited
        self.Ps = {}        # stores initial policy (returned by neural net)

    def search(self, s, reward, done):
        # ---------------- TERMINAL STATE ---------------
        if done == True:
            return reward

        # ------------- EXPLORING FROM A LEAF NODE ----------------------
        #check if the state has a policy from it yet, if not then its a leaf
        if s not in self.Ps:
            self.Ps[s], v = self.nnet.predict(s)
            
            #check if the neural net has assigned a +ve prob to any policy
             
            sum_Ps_s = np.sum(self.Ps[s])
            if sum_Ps_s > 0:
                self.Ps[s] /= sum_Ps_s    # renormalize
            else:
                # if they were all zero then they are equally probable: (this shouldn't usually happen)
                # NB! All valid moves may = 0 if NNet architecture is insufficient or you've get overfitting or something else.
                # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.   
                print("All valid moves were masked, do workaround.")
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])

            self.Ns[s] = 0
            return -v
        

        # ------------- GET BEST ACTION -----------------------------
        # search through the valid actions and update the UCB for all actions then update best acions
        max_u, best_a = -float("inf"), -1
        for a in range(1):
            if (s,a) in self.Qsa:
                u = self.Qsa[(s,a)] + self.cpuct*self.Ps[s][a]*np.sqrt(self.Ns[s])/(1+self.Nsa[(s,a)])
            else:
                u = self.cpuct*self.Ps[s][a]*math.sqrt(self.Ns[s] + 1e-8)     # Q = 0 ?
            
            if u > max_u:
                max_u = u
                best_a = a
        a = best_a

        
        # ----------- RECURSION TO NEXT STATE ------------------------
        sp, reward, done, info = env.step(a)
        v = self.search(sp, reward, done)
        

        # ------------ BACKUP Q-VALUES AND N_VISITED -----------------
        # after we reach the terminal condition then the stack unwinds and we
        # propagate up the tree backing up Q and N as we go
        if (s,a) in self.Qsa:
            self.Qsa[(s,a)] = (self.Nsa[(s,a)]*self.Qsa[(s,a)] + v)/(self.Nsa[(s,a)]+1)
            self.Nsa[(s,a)] += 1

        else:
            self.Qsa[(s,a)] = v
            self.Nsa[(s,a)] = 1

        self.Ns[s] += 1
        return -v


# Policy Iteration and Episode Execution

In [18]:
def executeEpisode(nnet):
    examples = []
    s, reward, done, info = env.reset()
    #mcts = MCTS(nnet)  # initialise the MCTS
    steps = 100
    tot_reward = steps
    
    print(s)
    for t in range(steps):
        env.render()
        #for _ in range(numMCTSSims):
        #    mcts.search(s, reward, done)
        
        # choose a random action, 0 (left) or 1 (right) for now
        action = env.action_space.sample()
        
        # observation/state s is the cart angle, or game image data from other envs
        # bounded by [+-2.4, +-inf, +-12degrees, +-inf]
        # which are the [position, velocity, angle, pole velocity at tip]
        s, reward, done, info = env.step(action)
        tot_reward -= reward
                
        #usually we add the rewards at the end of the episode
        #usually append mcts.pi(s) too
        examples.append([s, tot_reward])
        
        #a = random.choice(len(mcts.pi(s)), p = mcts.pi(s))
        #s = game.nextState(s, a)
        
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            print(examples)
            return examples

In [19]:
executeEpisode(0)

0.023818285864886532
Episode finished after 12 timesteps
[[array([ 0.02312713, -0.22913655, -0.03721234,  0.24926613]), 99.0], [array([ 0.0185444 , -0.03350348, -0.03222702, -0.05491822]), 98.0], [array([ 0.01787433,  0.16206538, -0.03332538, -0.35759229]), 97.0], [array([ 0.02111564, -0.03256734, -0.04047723, -0.07560117]), 96.0], [array([ 0.02046429,  0.16311081, -0.04198925, -0.38077503]), 95.0], [array([ 0.02372651,  0.35880308, -0.04960475, -0.68639591]), 94.0], [array([ 0.03090257,  0.55457725, -0.06333267, -0.99427377]), 93.0], [array([ 0.04199411,  0.7504866 , -0.08321814, -1.30615599]), 92.0], [array([ 0.05700385,  0.94655904, -0.10934126, -1.62368476]), 91.0], [array([ 0.07593503,  1.14278499, -0.14181496, -1.94834827]), 90.0], [array([ 0.09879073,  1.3391032 , -0.18078192, -2.28142334]), 89.0], [array([ 0.12557279,  1.14606143, -0.22641039, -2.04944041]), 88.0]]


[[array([ 0.02312713, -0.22913655, -0.03721234,  0.24926613]), 99.0],
 [array([ 0.0185444 , -0.03350348, -0.03222702, -0.05491822]), 98.0],
 [array([ 0.01787433,  0.16206538, -0.03332538, -0.35759229]), 97.0],
 [array([ 0.02111564, -0.03256734, -0.04047723, -0.07560117]), 96.0],
 [array([ 0.02046429,  0.16311081, -0.04198925, -0.38077503]), 95.0],
 [array([ 0.02372651,  0.35880308, -0.04960475, -0.68639591]), 94.0],
 [array([ 0.03090257,  0.55457725, -0.06333267, -0.99427377]), 93.0],
 [array([ 0.04199411,  0.7504866 , -0.08321814, -1.30615599]), 92.0],
 [array([ 0.05700385,  0.94655904, -0.10934126, -1.62368476]), 91.0],
 [array([ 0.07593503,  1.14278499, -0.14181496, -1.94834827]), 90.0],
 [array([ 0.09879073,  1.3391032 , -0.18078192, -2.28142334]), 89.0],
 [array([ 0.12557279,  1.14606143, -0.22641039, -2.04944041]), 88.0]]

In [20]:
def policyIterSP():
    nnet.initNNet()
    examples = []
    
    numIters, numEps = 10, 10
    for i in range(numIters):
        for e in range(numEps):
            examples += executeEpisode(nnet)
            
        new_nnet = trainNNet(examples)
        frac_win = pit(new_nnet, nnet)
        
        if frac_win > 0.55:
            nnet = new_nnet
            
    return nnet

# Neural Network