<a href="https://colab.research.google.com/github/abyssinia28/deeplearning/blob/master/CartPole_Problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The CartPole Problem

The agent in the CartPole problem controls the cart. Each observation consists of the cart position, cart velocity, pole angle, and pole velocity at the tip of the pole. The agent can take one of two actions: pushing the cart to the left or to the right. For every timestep the pole stays upright, the agent receives a reward of +1; the episode ends once the pole tilts too far or the cart moves out of bounds, so keeping the pole balanced for longer earns a higher total reward.




# 1.   Importing libraries








In [6]:
import gym
import math
import random
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from collections import deque
from IPython import display
import matplotlib.pyplot as plt

Using TensorFlow backend.


# 2. Setting up the OpenAI gym environment

In [4]:
env = gym.make('CartPole-v0')

# Sanity-check the environment by playing a few episodes with random actions.
for i_episode in range(20):  # number of random episodes to play
  observation = env.reset()  # start every episode from a fresh initial state
  for t in range(1000):  # upper bound on the number of steps per episode
    # env.render()  # uncomment to visualise the rollout
    action = env.action_space.sample()  # uniformly random action
    observation, reward, done, info = env.step(action)

    # Stop as soon as the episode is over: the environment must be reset
    # before step() is called again once done has been returned as True.
    if done:
      break

# Describe the action and observation spaces the agent will work with.
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)



Discrete(2)
Box(4,)
[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]


# 3. Defining Parameters

In [0]:
# --- Training parameters ---

n_episodes = 500        # maximum number of training episodes
n_win_ticks = 195       # mean survival time (in ticks) that counts as solved
max_env_steps = None    # optional override for the episode step limit

gamma = 1.0             # discount factor applied to the predicted future value
epsilon = 1.0           # upper bound on the exploration rate (1.0 = fully random)
epsilon_min = 0.01      # lower bound on the exploration rate
epsilon_decay = 0.995   # scales the schedule that reduces exploration
alpha = 0.01            # learning rate handed to the optimizer
alpha_decay = 0.01      # learning-rate decay handed to the optimizer

batch_size = 64         # number of transitions sampled per replay step
monitor = False         # not referenced by the cells visible in this notebook
quiet = False           # set to True to silence the progress messages

# --- Environment and replay memory ---

memory = deque(maxlen=100000)  # bounded buffer of past transitions
env = gym.make('CartPole-v0')
if max_env_steps is not None:
    # NOTE(review): confirm that this is the attribute the environment's
    # time-limit wrapper actually reads in the installed gym version.
    env.max_episode_steps = max_env_steps


# 4. Building the neural network

In [0]:
# Q-network: takes the 4-component observation and outputs one value per action.
model = Sequential([
    Dense(48, input_dim=4, activation='tanh'),  # observation has 4 components
    Dense(48, activation='tanh'),
    Dense(2, activation='linear'),              # one output for each of the 2 actions
])
model.compile(loss='mse', optimizer=Adam(lr=alpha, decay=alpha_decay))


# 5. Defining necessary functions

In [0]:
#Define necessary functions

def remember(state, action, reward, next_state, done):
  """
  Store one transition in the replay memory.

  The five values are appended to the module-level ``memory`` deque as a
  single tuple, in the order (state, action, reward, next_state, done).
  """
  transition = (state, action, reward, next_state, done)
  memory.append(transition)
    
def choose_action(state, epsilon):
    """
    Pick an action with an epsilon-greedy rule.

    A uniform random number is drawn first. If it is less than or equal to
    ``epsilon``, a random action is sampled from the environment's action
    space; otherwise the index of the largest value the model predicts for
    ``state`` is returned.
    """
    explore = np.random.random() <= epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(model.predict(state))
def get_epsilon(t):
    """
    Return the exploration rate to use at step/episode index ``t``.

    The rate follows 1 - log10((t + 1) * epsilon_decay), so it shrinks as
    ``t`` grows, and is clamped to the range [epsilon_min, epsilon].
    """
    scheduled = 1.0 - math.log10((t + 1) * epsilon_decay)
    capped = min(epsilon, scheduled)
    return max(epsilon_min, capped)

def preprocess_state(state):
    """
    Reshape an observation into a single-row batch of shape (1, 4), the
    input layout the model is fed with.
    """
    batched = np.reshape(state, (1, 4))
    return batched

def replay(batch_size, epsilon):
    """
    Train the model on a random minibatch drawn from the replay memory.

    For every sampled transition the target for the taken action is set to
    ``reward`` when the transition ended the episode, and to
    ``reward + gamma * max(Q(next_state))`` otherwise; targets for the other
    actions keep the model's current predictions.

    Args:
        batch_size: maximum number of transitions to sample from ``memory``.
        epsilon: current exploration rate.

    Returns:
        ``epsilon`` multiplied by ``epsilon_decay`` if it is still above
        ``epsilon_min``, otherwise ``epsilon`` unchanged. Rebinding the
        parameter alone has no effect outside this function, so the decayed
        value is returned; callers are free to ignore it.
    """
    sample_size = min(len(memory), batch_size)
    if sample_size == 0:
        # Nothing has been remembered yet, so there is nothing to fit on.
        return epsilon

    minibatch = random.sample(memory, sample_size)

    # Stack the stored states into (n, 4) arrays so the network is queried
    # twice per replay instead of twice per sampled transition.
    # Assumes each stored state has shape (1, 4), as preprocess_state produces.
    states = np.vstack([transition[0] for transition in minibatch])
    next_states = np.vstack([transition[3] for transition in minibatch])

    y_batch = model.predict(states)
    next_q = model.predict(next_states)

    for row, (_, action, reward, _, done) in enumerate(minibatch):
        # Terminal transitions have no future value to bootstrap from.
        y_batch[row][action] = reward if done else \
            reward + gamma * np.max(next_q[row])

    model.fit(states, y_batch, batch_size=len(states), verbose=0)

    # Progressively become less explorative.
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay
    return epsilon

# 6. Defining a run function

In [0]:
#Define run function 
def run():
    """
    Train the agent episode by episode until the task is solved or
    ``n_episodes`` episodes have been played.

    In every episode the agent acts with ``choose_action``, each transition
    is stored with ``remember``, and after the episode the model is trained
    once with ``replay``. The task counts as solved when the mean survival
    time over the last 100 episodes reaches ``n_win_ticks`` (checked only
    from episode index 100 onwards).

    Returns:
        ``e - 100`` for the episode index ``e`` at which the task was
        solved, or the index of the last episode played if it was not.
    """
    scores = deque(maxlen=100)  # survival times of the most recent 100 episodes
    for e in range(n_episodes):  # e == episode index
        state = preprocess_state(env.reset())
        done = False
        i = 0  # ticks survived in this episode
        # The exploration rate depends only on the episode index, so it is
        # computed once per episode rather than on every step.
        exploration_rate = get_epsilon(e)
        while not done:
            action = choose_action(state, exploration_rate)
            next_state, reward, done, _ = env.step(action)
            # env.render()
            next_state = preprocess_state(next_state)
            remember(state, action, reward, next_state, done)
            state = next_state
            i += 1

        scores.append(i)
        mean_score = np.mean(scores)

        if mean_score >= n_win_ticks and e >= 100:
            if not quiet:
                print('Ran {} episodes. Solved after {} trials'.format(e, e - 100))
            return e - 100
        if e % 20 == 0 and not quiet:
            # Adjacent literals are joined by the parser, so the message
            # contains no stray indentation whitespace.
            print('[Episode {}] - Mean survival time over last 100 episodes '
                  'was {} ticks'.format(e, mean_score))

        replay(batch_size, exploration_rate)

    if not quiet:
        print('Did not solve after {} episodes'.format(n_episodes))
    plt.plot(scores)
    # Index of the last episode played; also well-defined when n_episodes is 0.
    return n_episodes - 1

In [44]:
# Train the agent; the value returned by run() is displayed as this cell's output.
run()

[Episode 0] - Mean survival time over last 100 episodes was           15.0 ticks
[Episode 20] - Mean survival time over last 100 episodes was           153.23809523809524 ticks
[Episode 40] - Mean survival time over last 100 episodes was           123.1951219512195 ticks
[Episode 60] - Mean survival time over last 100 episodes was           116.54098360655738 ticks
[Episode 80] - Mean survival time over last 100 episodes was           128.7530864197531 ticks
[Episode 100] - Mean survival time over last 100 episodes was           123.86 ticks
[Episode 120] - Mean survival time over last 100 episodes was           99.14 ticks
[Episode 140] - Mean survival time over last 100 episodes was           86.43 ticks
[Episode 160] - Mean survival time over last 100 episodes was           76.15 ticks
[Episode 180] - Mean survival time over last 100 episodes was           49.68 ticks
[Episode 200] - Mean survival time over last 100 episodes was           36.06 ticks
[Episode 220] - Mean survival ti

235