In [55]:
import sys
import gym
import time
import random
import numpy as np
from collections import defaultdict

`Observation`: 

        Type: Box(4)
        Num	Observation                 Min         Max
        0	Cart Position             -4.8            4.8
        1	Cart Velocity             -Inf            Inf
        2	Pole Angle                 -24 deg        24 deg
        3	Pole Velocity At Tip      -Inf            Inf
        
`Actions`:

        Type: Discrete(2)
        Num	Action
        0	Push cart to the left
        1	Push cart to the right
        
Note: The amount the velocity that is reduced or increased is not fixed; it depends on the angle the pole is pointing. This is because the center of gravity of the pole increases the amount of energy needed to move the cart underneath it
    
Reward:

        Reward is 1 for every step taken, including the termination step
        
Starting State:

        All observations are assigned a uniform random value in [-0.05..0.05]
        
Episode Termination:

        Pole Angle is more than 12 degrees
        Cart Position is more than 2.4 (center of the cart reaches the edge of the display)
        Episode length is greater than 200
        Solved Requirements
        Considered solved when the average reward is greater than or equal to 195.0 over 100 consecutive trials.

In [2]:
env = gym.make('CartPole-v0')

In [4]:
for i_episode in range(1):
    observation = env.reset()
    act=1
    for t in range(100):
        env.render()
        action = env.action_space.sample()
        print(action)
        observation, reward, done, info = env.step(act)
        print(observation,reward,done,info)
        #time.sleep(1)
        act = random.randint(0,1)
        if done:
            print('reward: ', reward)
            print("Episode finished after {} timesteps".format(t+1))
            break
env.close()

1
[-0.02037456  0.21687341 -0.00758537 -0.25880171] 1.0 False {}
1
[-0.01603709  0.02186057 -0.0127614   0.03147907] 1.0 False {}
1
[-0.01559988  0.21716318 -0.01213182 -0.26520274] 1.0 False {}
1
[-0.01125662  0.02221646 -0.01743588  0.02362914] 1.0 False {}
0
[-0.01081229  0.21758407 -0.01696329 -0.2745036 ] 1.0 False {}
0
[-0.00646061  0.41294389 -0.02245336 -0.57248815] 1.0 False {}
1
[ 0.00179827  0.21814385 -0.03390313 -0.28696248] 1.0 False {}
0
[ 0.00616115  0.02352139 -0.03964238 -0.00516215] 1.0 False {}
0
[ 0.00663157 -0.17101027 -0.03974562  0.27475428] 1.0 False {}
0
[ 0.00321137 -0.36554327 -0.03425053  0.55464119] 1.0 False {}
0
[-0.0040995  -0.16995756 -0.02315771  0.251367  ] 1.0 False {}
0
[-0.00749865  0.0254873  -0.01813037 -0.04852944] 1.0 False {}
1
[-0.0069889   0.22086447 -0.01910096 -0.34687706] 1.0 False {}
0
[-0.00257161  0.02601934 -0.0260385  -0.06027803] 1.0 False {}
1
[-0.00205123 -0.16871978 -0.02724406  0.22407718] 1.0 False {}
1
[-0.00542562 -0.3634419

In [5]:
print(env.observation_space)
print(env.action_space)

Box(4,)
Discrete(2)


In [6]:
env.reset()

array([-0.00638655, -0.02679715, -0.01659051, -0.01061832])

In [40]:
def generate_episode_from_limit(cartPole_env):
    episode = []
    state = cartPole_env.reset()
    while True:
        #bad test policy
        action = env.action_space.sample()
        next_state, reward, done, info = cartPole_env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode

In [42]:
#for i in range(3):
#    print(generate_episode_from_limit(env))

In [28]:
def discritize(CVB=(-3, 3), PVB=(-3, 3)):
    """
    CVB: default cart velocity bounds = -1, 1
    PVB: default pole velocity bounds = -1, 1
    
    For Cartpole, we have:

        Type: Box(4)
        Num	Observation                 Min         Max
        0	Cart Position             -4.8            4.8
        1	Cart Velocity             -Inf            Inf
        2	Pole Angle                 -24 deg        24 deg
        3	Pole Velocity At Tip      -Inf            Inf
    """
    assert(CVB[0] < CVB[1] and PVB[0] < PVB[1])
    
    cart_position_space = np.linspace(-4.8, 4.8, 25)
    cart_velocity_space = np.linspace(CVB[0], CVB[1], 15)
    pole_angle_space = np.linspace(-24, 24, 25)
    pole_velocity_space = np.linspace(PVB[0], PVB[1], 15)
    
    return cart_position_space, cart_velocity_space, pole_angle_space, pole_velocity_space

In [29]:
cps, cvs, pas, pvs = discritize()

In [32]:
s0 = env.reset()

In [35]:
np.argmin(abs(s0[1] - cvs))

7

In [47]:
def getDiscreteStateFromObs(obs):
    cartPos = np.argmin(abs(obs[0] - cps))
    cartVel = np.argmin(abs(obs[1] - cvs))
    poleAng = np.argmin(abs(obs[2] - pas))
    poleVel = np.argmin(abs(obs[3] - pvs))
    return (cartPos, cartVel, poleAng, poleVel)

In [44]:
ep = generate_episode_from_limit(env)

In [45]:
ep[0]

(array([ 0.0241418 ,  0.0473015 ,  0.03660755, -0.04393504]), 1, 1.0)

In [48]:
getDiscreteStateFromObs(ep[0][0])

(12, 7, 12, 7)

In [53]:
def mc_prediction_v(env, num_episodes, generate_episode, gamma=1.0):
    # initialize empty dictionary of lists
    returns = defaultdict(np.float64)
    N = defaultdict(int)
    # loop over episodes
    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 1000 == 0:
            print("\rEpisode {}/{}.".format(i_episode, num_episodes), end="")
            sys.stdout.flush()
        
        episode = generate_episode(env)
        seen = []
        for t, (state, action, reward) in enumerate(episode):
            cState = getDiscreteStateFromObs(state)
            if cState not in seen:
                seen.append(cState)
                N[cState] += 1
                G = 0
                for state, action, reward in episode[t:]:
                    G += reward
                returns[cState] += G        
        ## TODO: complete the function
    V = defaultdict(np.float64)
    for k in N.keys():
        V[k] = returns[k] / N[k]
    return V

In [56]:
V = mc_prediction_v(env, 10000, generate_episode_from_limit)

Episode 10000/10000.

In [57]:
V

defaultdict(numpy.float64,
            {(12, 7, 12, 7): 22.1823,
             (12, 6, 12, 8): 20.132210788899283,
             (12, 6, 12, 9): 15.022915101427499,
             (12, 8, 12, 6): 19.859346365302383,
             (12, 7, 12, 8): 20.654341127424324,
             (12, 7, 12, 9): 5.357348703170029,
             (12, 7, 12, 6): 20.684421014361025,
             (12, 5, 12, 10): 8.080988693136996,
             (12, 8, 12, 5): 14.870724531077602,
             (12, 8, 12, 4): 3.265705458290422,
             (12, 9, 12, 3): 2.6946702800361337,
             (12, 6, 12, 7): 19.029735682819382,
             (12, 5, 12, 9): 16.805256869773,
             (12, 4, 12, 11): 3.8203309692671397,
             (12, 4, 12, 12): 1.7682403433476395,
             (11, 3, 12, 12): 1.7586206896551724,
             (12, 9, 12, 4): 8.19243632621559,
             (12, 10, 12, 3): 3.5917893106119285,
             (12, 10, 12, 2): 1.7707889125799574,
             (12, 5, 12, 11): 2.7387173396674585,
     