# CartPole-v1 with TensorFlow2

Problem [gym.openai.com/envs/CartPole-v1/](https://gym.openai.com/envs/CartPole-v1/)

## Setup

In [0]:
import numpy as np
import os
import pandas as pd
import random

In [2]:
try:
    import gym
except:
    !pip install gym
import gym
print(gym.__version__)

0.10.11


## Visualize gym environment - random action

In [3]:
import gym
env = gym.make("CartPole-v1")
no_actions = env.action_space.n
no_observations = env.observation_space.shape[0]
print(no_actions)
print(no_observations)

2
4


Action and observation interpretations: [github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py](https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py)

In [4]:
for i in range(1):
    state = env.reset()
    for t in range(100):
    #     env.render()
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        print(state)
            
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

[ 0.03417177 -0.21310294 -0.00661476  0.27387159]
[ 0.02990971 -0.01788724 -0.00113733 -0.02089029]
[ 0.02955197  0.17725101 -0.00155514 -0.31393184]
[ 0.03309699 -0.01784876 -0.00783377 -0.02173976]
[ 0.03274001  0.17738466 -0.00826857 -0.316884  ]
[ 0.0362877   0.3726234  -0.01460625 -0.61216303]
[ 0.04374017  0.56794641 -0.02684951 -0.90941037]
[ 0.0550991   0.76342127 -0.04503772 -1.21040979]
[ 0.07036753  0.95909488 -0.06924591 -1.51685919]
[ 0.08954942  1.15498281 -0.0995831  -1.83032894]
[ 0.11264908  1.35105676 -0.13618967 -2.15221182]
[ 0.13967022  1.15751141 -0.17923391 -1.90449809]
[ 0.16282044  0.96472272 -0.21732387 -1.6723607 ]
Episode finished after 13 timesteps


## Train with tabular Q-Learning

In [5]:
q = pd.DataFrame(np.random.random((10000, 2)), columns=range(no_actions))
q.index = ["{0:0>4}".format(i) for i in range(10000)]
q = q / 100 # scale values to [0, 0.01)
q.head(5)

Unnamed: 0,0,1
0,0.001611,0.00973
1,0.000385,0.005382
2,0.009447,0.004569
3,0.009794,0.007659
4,0.009304,0.004674


In [0]:
def bin_state_idx(state):
    """
    Min-max definitions: github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py
    """
    x_no_bins = 3
    x = pd.cut([-4.8, state[0], 4.8], bins=x_no_bins, labels=range(x_no_bins))
    x_vel_no_bins = 3
    x_vel = pd.cut([-5, state[1], 5], bins=x_vel_no_bins, labels=range(x_vel_no_bins))
    theta_no_bins = 6
    theta = pd.cut([-0.24, state[2], 0.24], bins=theta_no_bins, labels=range(theta_no_bins))
    theta_vel_no_bins = 3
    theta_vel = pd.cut([-5, state[3], 5], bins=theta_vel_no_bins, labels=range(theta_vel_no_bins))
    return "{}{}{}{}".format(x[1], x_vel[1], theta[1], theta_vel[1]) 

In [0]:
def get_best_action(state):
    state_idx = bin_state_idx(state)
    q_vals = q.loc[state_idx, :]
    action = q.loc[state_idx, :].idxmax()
    return action, q_vals

In [0]:
ALPHA = 0.05
GAMMA = 0.95
def fit_model(state, action, reward, next_state, done):
    state_action, state_q = get_best_action(state)
    state_idx = bin_state_idx(state)
    next_state_action, next_state_q = get_best_action(next_state)
    q_update = reward
    if not done:
        q_update =  (1-ALPHA) * q.loc[state_idx, action] + ALPHA * (reward + GAMMA * next_state_q[next_state_action])
    q.loc[state_idx, action] = q_update

In [0]:
max_ep = 5000 #1500
for ep in range(max_ep):
    state = env.reset()
    for t in range(100):
#         env.render()
        if random.random() < 0.05:
            action = env.action_space.sample()
        else:
            action, _ = get_best_action(state)
        next_state, reward, done, info = env.step(action)
        fit_model(state, action, reward, next_state, done)
        state = next_state
        if done:
#             print("Episode finished after {} timesteps".format(t+1))
            break

## Test

In [10]:
# env.render()
for ep in range(5):
    state = env.reset()
    for t in range(1000):
        action, _ = get_best_action(state)
        next_state, reward, done, info = env.step(action)
        if done:
            print("Episode {} finished after {} timesteps".format(ep, t))
            break
        state = next_state
env.close()

Episode 0 finished after 9 timesteps
Episode 1 finished after 9 timesteps
Episode 2 finished after 9 timesteps
Episode 3 finished after 10 timesteps
Episode 4 finished after 10 timesteps
