# Q Learning on Mountain Car Environment

In [1]:
!pip install gym #already installed



In [2]:
import gym
env = gym.make('MountainCar-v0') #for discription https://github.com/openai/gym/wiki/MountainCar-v0
env.reset()

array([-0.50497914,  0.        ])

In [3]:
import random
done = False

while not done:
  action = (int(random.random()*10))%3 #action 0->push car left; action 1->do nothing; action 2-> push car right
  state, reward, done, _ = env.step(action)# state is a tuple of (position, velocity)
  print(state)  

  env.render()

env.close()

#the car needs to build momentum

[-5.05118714e-01 -1.39574660e-04]
[-5.05396818e-01 -2.78104130e-04]
[-0.50681137 -0.00141455]
[-0.50735177 -0.0005404 ]
[-0.50801398 -0.00066221]
[-5.07793031e-01  2.20949028e-04]
[-0.50869058 -0.00089755]
[-5.08699905e-01 -9.32382648e-06]
[-0.50982093 -0.00112103]
[-0.51104527 -0.00122433]
[-0.51236373 -0.00131846]
[-0.51476644 -0.00240271]
[-0.51823538 -0.00346894]
[-0.52274455 -0.00450917]
[-0.52626012 -0.00351557]
[-0.52875573 -0.00249561]
[-0.53121267 -0.00245694]
[-0.53361251 -0.00239984]
[-0.53493726 -0.00132475]
[-0.53617698 -0.00123973]
[-5.36322394e-01 -1.45410910e-04]
[-0.5353724   0.00094999]
[-0.53333412  0.00203828]
[-0.53222284  0.00111128]
[-5.32046888e-01  1.75954915e-04]
[-5.31807579e-01  2.39308974e-04]
[-0.53250671 -0.00069913]
[-0.53313904 -0.00063233]
[-0.53369983 -0.00056079]
[-5.34184868e-01 -4.85041163e-04]
[-0.53559053 -0.00140566]
[-0.53790627 -0.00231574]
[-0.54111473 -0.00320847]
[-0.54419189 -0.00307716]
[-0.5461147  -0.00192281]
[-0.54886877 -0.00275407]


## Implementing Q Table

We will be implementing SARSAMAX

![](SARSAMAX.png)

- State contains continous values
- Q Table will contains prohibitively large number of state values
- We need to **dicretize them** in order to create Q Table
- Initially Q Table will contain random values

In [4]:
#display observation space low and high values
print(env.observation_space.high)#[position,velocity]
print(env.observation_space.low)#[position, velocity]
print(env.action_space.n)

[0.6  0.07]
[-1.2  -0.07]
3


In [5]:
DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)#os->observation space; 20 is the bucket size chosen by us (hyperparameter)
DISCRETE_OS_SIZE # We will have 20 discrete combinations for position and velocity. 
#State space will be 20 * 20 = 400

[20, 20]

In [6]:
#For each of 20 discrete state, we will have a bucket containing some range of position values and some range of velocity value
discrete_os_bucket_size = (env.observation_space.high-env.observation_space.low)/DISCRETE_OS_SIZE
discrete_os_bucket_size

array([0.09 , 0.007])

In [7]:
#creating Q Table
import numpy as np
#Q Table is assigned arbitrarily

#q_table = np.random.uniform(low=-2,high=0,size=(DISCRETE_OS_SIZE + [3])) #uniform is used when we want random values drawn from [low,high) range

q_table = np.zeros(DISCRETE_OS_SIZE + [3])

#low and high are randomly chosen
print(q_table.shape)
q_table #Q Table is of 3 Dimension  poition_bucket (20) * velocity_bucket (20) * action (3)

(20, 20, 3)


array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       ...,

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        ...,
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

## Now we will implement SARSAMAX algorithm to fill the Q Table

In [8]:
LEARNING_RATE = 0.1 #hyperparameter 
DISCOUNT = 0.95 #hyperparameter; how important are future rewards  
EPISODES = 25000 #hyperparameter
RENDER_EVERY = 5000

#a helper function to discretize continous states as returned by the environment 
def get_discrete_state(state):
    #discrete_state -> floor((current_state- lowest_state)/bucket_size)
    discrete_state = (state - env.observation_space.low) / discrete_os_bucket_size
    return tuple(discrete_state.astype(np.int)) #taking the floor value

In [9]:
# Exploration settings
epsilon = 1  # not a constant, qoing to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

In [10]:
for episode in range(EPISODES):
    done = False
    discrete_state = get_discrete_state(env.reset()) #we always start with initial discrete state
    if episode % RENDER_EVERY == 0:
        render=True
        print(episode)
    else:
        render=False
        
    while not done:
        if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(q_table[discrete_state])
        else:
            # Get random action
            action = np.random.randint(0, env.action_space.n)
        state, reward, done, _ = env.step(action)# state is a tuple of (position, velocity)
        if render:
            env.render()
        new_discrete_state = get_discrete_state(state)
        if not done:
            q_st_at=q_table[discrete_state+(action,)]
            q_st_plus_1_a = np.max(q_table[new_discrete_state])
            new_q_st_at = q_st_at + LEARNING_RATE * (reward + DISCOUNT * q_st_plus_1_a - q_st_at)
            q_table[discrete_state+(action,)] = new_q_st_at
        elif state[0] >= env.goal_position: #if the episode terminates because we have achieved our goal
            new_q_st_at = q_st_at + LEARNING_RATE * (0 + DISCOUNT * q_st_plus_1_a - q_st_at)
        discrete_state = new_discrete_state
     # Decaying is being done every episode if episode number is within decaying range
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value
env.close()

0


KeyboardInterrupt: 

# QLearning on Cartpole Environment

In [11]:
!pip install gym #already installed



In [12]:
import gym
env = gym.make('CartPole-v0') #for discription https://github.com/openai/gym/wiki/CartPole-v0
env.reset()

array([-0.01116547, -0.03905466, -0.00997557,  0.04651892])

In [13]:
import random
done = False

while not done:
  action = (int(random.random()*10))%2 #action 0->push cart left; action 1->push cart right
  state, reward, done, _ = env.step(action)# state is a tuple of (cart position, cart velocity, pole angle, pole velocity at tip)
  print(state)  
  env.render()

env.close()

#the cart need to balance the pole as much as possible

[-0.01194657 -0.23403216 -0.00904519  0.33603785]
[-0.01662721 -0.42902422 -0.00232443  0.62585474]
[-0.0252077  -0.2338699   0.01019267  0.33244066]
[-0.02988509 -0.42913544  0.01684148  0.62832034]
[-0.0384678  -0.23425253  0.02940789  0.3409886 ]
[-0.04315285 -0.42978028  0.03622766  0.64279809]
[-0.05174846 -0.23518152  0.04908362  0.36174007]
[-0.05645209 -0.43096552  0.05631842  0.66948717]
[-0.0650714  -0.23666998  0.06970816  0.3950548 ]
[-0.0698048  -0.04260279  0.07760926  0.1251386 ]
[-0.07065685 -0.23874586  0.08011203  0.44126181]
[-0.07543177 -0.4349047   0.08893727  0.75808413]
[-0.08412987 -0.63113235  0.10409895  1.07737538]
[-0.09675251 -0.82746369  0.12564646  1.400828  ]
[-0.11330179 -0.63410706  0.15366302  1.14992238]
[-0.12598393 -0.4412874   0.17666147  0.90910022]
[-0.13480968 -0.24893964  0.19484347  0.67673789]
[-0.13978847 -0.44615829  0.20837823  1.02389042]
[-0.14871163 -0.25432815  0.22885604  0.8031906 ]


## Implementing Q Table

We will be implementing SARSAMAX

![](SARSAMAX.png)

- State contains continous values
- Q Table will contains prohibitively large number of state values
- We need to **dicretize them** in order to create Q Table
- Initially Q Table will contain random values

In [15]:
#display observation space low and high values
print(env.observation_space.high)#[position,cart_velocity,pole_angle,pole_velocity]
print(env.observation_space.low)#[position,cart_velocity,pole_angle,pole_velocity]
print(env.action_space.n)

[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
2


In [16]:
#Reference 1 for discretizizng -> https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947 
#Reference 2 for discretizing -> https://github.com/deepakkavoor/cartpole-rl/blob/master/cartpole-q_learning.py
import math
env.observation_space.high[1]= 0.5
env.observation_space.low[1]= -0.5
env.observation_space.high[3]= math.radians(50)
env.observation_space.low[3]= -math.radians(50)

In [17]:
import numpy as np
LEARNING_RATE = 0.1 #hyperparameter 
DISCOUNT = 0.95 #hyperparameter; how important are future rewards  
EPISODES = 20000 #hyperparameter
RENDER_EVERY = 1000



#Reference for discretizing=> https://mc.ai/openai-gyms-cart-pole-balancing-using-q-learning/
buckets=(1, 1, 6, 12,) #hyperparameter -> dimensions of Q Table (position,cart_velocity,pole_angle,pole_velocity)
q_table = np.zeros(buckets + (env.action_space.n,))
print(q_table.shape)



#a helper function to discretize continous states as returned by the environment 
def get_discrete_state(state):
    ratios = [(state[i] + abs(env.observation_space.low[i])) / (env.observation_space.high[i] - env.observation_space.low[i]) for i in range(len(state))]
    new_obs = [int(round((buckets[i] - 1) * ratios[i])) for i in range(len(state))]
    new_obs = [min(buckets[i] - 1, max(0, new_obs[i])) for i in range(len(state))]
    return tuple(new_obs)
    
    
    



(1, 1, 6, 12, 2)


In [18]:
# Exploration settings
epsilon = 1  # not a constant, qoing to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

In [19]:
for episode in range(EPISODES):
    done = False
    discrete_state = get_discrete_state(env.reset()) #we always start with initial discrete state'
    if episode % RENDER_EVERY == 0:
        render=True
        print(episode)
    else:
        render=False
    steps_per_episode = 0
    while not done:
        
        if render:
            env.render()
            steps_per_episode+=1
            print('Step Count'+str(steps_per_episode))
        
        if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(q_table[discrete_state])
        else:
            # Get random action
            action = np.random.randint(0, env.action_space.n)
        
        state, reward, done, _ = env.step(action)# state is a tuple of (position,cart_velocity,pole_angle,pole_velocity)
        new_discrete_state = get_discrete_state(state)
        if not done:
            q_st_at=q_table[discrete_state+(action,)]
            q_st_plus_1_a = np.max(q_table[new_discrete_state])
            new_q_st_at = q_st_at + LEARNING_RATE * (reward + DISCOUNT * q_st_plus_1_a - q_st_at)
            q_table[discrete_state+(action,)] = new_q_st_at
        discrete_state = new_discrete_state
     # Decaying is being done every episode if episode number is within decaying range
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value
env.close()

0
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step Count15
Step Count16
Step Count17
Step Count18
Step Count19
Step Count20
Step Count21
Step Count22
Step Count23
Step Count24
Step Count25
Step Count26
Step Count27
Step Count28
Step Count29
1000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step Count15
Step Count16
Step Count17
Step Count18
Step Count19
Step Count20
Step Count21
Step Count22
Step Count23
2000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
3000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step 

9000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step Count15
Step Count16
Step Count17
Step Count18
Step Count19
Step Count20
Step Count21
Step Count22
Step Count23
Step Count24
Step Count25
Step Count26
Step Count27
Step Count28
Step Count29
Step Count30
Step Count31
Step Count32
Step Count33
Step Count34
Step Count35
Step Count36
Step Count37
Step Count38
Step Count39
Step Count40
Step Count41
Step Count42
Step Count43
Step Count44
Step Count45
Step Count46
Step Count47
Step Count48
Step Count49
Step Count50
Step Count51
Step Count52
Step Count53
Step Count54
Step Count55
Step Count56
Step Count57
Step Count58
Step Count59
Step Count60
Step Count61
Step Count62
Step Count63
Step Count64
Step Count65
Step Count66
Step Count67
Step Count68
Step Count69
Step Count70
Step Count71
Step Count72
Step Count73
Step Count74
Step Count75
Step Count76
Step Count77
Ste

Step Count135
Step Count136
Step Count137
Step Count138
Step Count139
Step Count140
Step Count141
Step Count142
Step Count143
Step Count144
Step Count145
Step Count146
Step Count147
Step Count148
Step Count149
Step Count150
Step Count151
Step Count152
Step Count153
Step Count154
Step Count155
Step Count156
Step Count157
Step Count158
Step Count159
Step Count160
Step Count161
Step Count162
Step Count163
Step Count164
Step Count165
Step Count166
Step Count167
Step Count168
Step Count169
Step Count170
Step Count171
Step Count172
Step Count173
Step Count174
Step Count175
Step Count176
Step Count177
Step Count178
Step Count179
Step Count180
Step Count181
Step Count182
Step Count183
Step Count184
Step Count185
Step Count186
Step Count187
Step Count188
Step Count189
Step Count190
Step Count191
Step Count192
Step Count193
Step Count194
Step Count195
Step Count196
Step Count197
Step Count198
Step Count199
Step Count200
13000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count

Step Count117
Step Count118
Step Count119
Step Count120
Step Count121
Step Count122
Step Count123
Step Count124
Step Count125
Step Count126
Step Count127
Step Count128
Step Count129
Step Count130
Step Count131
Step Count132
Step Count133
Step Count134
Step Count135
Step Count136
Step Count137
Step Count138
Step Count139
Step Count140
Step Count141
Step Count142
Step Count143
Step Count144
Step Count145
Step Count146
Step Count147
Step Count148
Step Count149
Step Count150
Step Count151
17000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step Count15
Step Count16
Step Count17
Step Count18
Step Count19
Step Count20
Step Count21
Step Count22
Step Count23
Step Count24
Step Count25
Step Count26
Step Count27
Step Count28
Step Count29
Step Count30
Step Count31
Step Count32
Step Count33
Step Count34
Step Count35
Step Count36
Step Count37
Step Count38
Step Count39
Step C

## Sample Run of Agent

In [None]:

done = False
discrete_state = get_discrete_state(env.reset()) #we always start with initial discrete state'
steps=0
while not done:
    steps+=1
    print(steps)
    env.render()
        
    action = np.argmax(q_table[discrete_state])
        
    state, reward, done, _ = env.step(action)# state is a tuple of (position,cart_velocity,pole_angle,pole_velocity)

    discrete_state=get_discrete_state(state)

# Q Learning Hyperparameter Optimization -> Finding Appropriate Number of Episodes

In [20]:
import gym
env = gym.make('CartPole-v0') #for discription https://github.com/openai/gym/wiki/CartPole-v0
env.reset()

array([ 0.0138466 , -0.04644006,  0.02646106,  0.0258426 ])

In [21]:
import random
done = False

while not done:
  action = (int(random.random()*10))%2 #action 0->push cart left; action 1->push cart right
  state, reward, done, _ = env.step(action)# state is a tuple of (cart position, cart velocity, pole angle, pole velocity at tip)
  print(state)  
  env.render()

env.close()

#the cart need to balance the pole as much as possible

[ 0.0129178   0.14829262  0.02697791 -0.25837552]
[ 0.01588365 -0.04720388  0.0218104   0.04269307]
[ 0.01493957  0.14759863  0.02266427 -0.24302946]
[ 0.01789155  0.34238964  0.01780368 -0.52847821]
[ 0.02473934  0.1470218   0.00723411 -0.23023888]
[ 0.02767977  0.34203963  0.00262933 -0.52063116]
[ 0.03452057  0.53712447 -0.00778329 -0.81248439]
[ 0.04526306  0.73235216 -0.02403298 -1.10760532]
[ 0.0599101   0.92778163 -0.04618508 -1.40772988]
[ 0.07846573  0.73326218 -0.07433968 -1.12983543]
[ 0.09313098  0.53918828 -0.09693639 -0.86136435]
[ 0.10391474  0.73548714 -0.11416368 -1.182885  ]
[ 0.11862448  0.54201648 -0.13782138 -0.92805938]
[ 0.12946481  0.73870297 -0.15638256 -1.26068288]
[ 0.14423887  0.54588812 -0.18159622 -1.02077925]
[ 0.15515664  0.35358898 -0.20201181 -0.79016993]
[ 0.16222842  0.5508267  -0.21781521 -1.13899858]


In [22]:
#display observation space low and high values
print(env.observation_space.high)#[position,cart_velocity,pole_angle,pole_velocity]
print(env.observation_space.low)#[position,cart_velocity,pole_angle,pole_velocity]
print(env.action_space.n)

[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]
2


In [23]:
#Reference 1 for discretizizng -> https://medium.com/@tuzzer/cart-pole-balancing-with-q-learning-b54c6068d947 
#Reference 2 for discretizing -> https://github.com/deepakkavoor/cartpole-rl/blob/master/cartpole-q_learning.py
import math
env.observation_space.high[1]= 0.5
env.observation_space.low[1]= -0.5
env.observation_space.high[3]= math.radians(50)
env.observation_space.low[3]= -math.radians(50)

## Now we will implement SARSAMAX algorithm to fill the Q Table

In [24]:
import numpy as np
LEARNING_RATE = 0.05 #hyperparameter 
DISCOUNT = 0.95 #hyperparameter; how important are future rewards  
EPISODES = 50000 #hyperparameter
RENDER_EVERY = 1000
episode_rewards=[]#to store total reward obtained in each step
aggr_ep_rewards = {'ep': [], 'avg': [], 'max': [], 'min': []}#To store various stats


#Reference for discretizing=> https://mc.ai/openai-gyms-cart-pole-balancing-using-q-learning/
buckets=(1, 1, 6, 12,) #hyperparameter -> dimensions of Q Table (position,cart_velocity,pole_angle,pole_velocity)
q_table = np.zeros(buckets + (env.action_space.n,))
print(q_table.shape)



#a helper function to discretize continous states as returned by the environment 
def get_discrete_state(state):
    ratios = [(state[i] + abs(env.observation_space.low[i])) / (env.observation_space.high[i] - env.observation_space.low[i]) for i in range(len(state))]
    new_obs = [int(round((buckets[i] - 1) * ratios[i])) for i in range(len(state))]
    new_obs = [min(buckets[i] - 1, max(0, new_obs[i])) for i in range(len(state))]
    return tuple(new_obs)

(1, 1, 6, 12, 2)


In [25]:
# Exploration settings
epsilon = 1  # not a constant, qoing to be decayed
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = EPISODES//2
epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

In [26]:
for episode in range(EPISODES):
    done = False
    discrete_state = get_discrete_state(env.reset()) #we always start with initial discrete state'
    if episode % RENDER_EVERY == 0:
        render=True
        print(episode)
    else:
        render=False
    steps_per_episode = 0
    episode_reward = 0#To track each episode rewards
    while not done:
        
        if render:
            env.render()
            steps_per_episode+=1
            print('Step Count'+str(steps_per_episode))
        
        if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(q_table[discrete_state])
        else:
            # Get random action
            action = np.random.randint(0, env.action_space.n)
        
        state, reward, done, _ = env.step(action)# state is a tuple of (position,cart_velocity,pole_angle,pole_velocity)
        episode_reward += reward
        new_discrete_state = get_discrete_state(state)
        if not done:
            q_st_at=q_table[discrete_state+(action,)]
            q_st_plus_1_a = np.max(q_table[new_discrete_state])
            new_q_st_at = q_st_at + LEARNING_RATE * (reward + DISCOUNT * q_st_plus_1_a - q_st_at)
            q_table[discrete_state+(action,)] = new_q_st_at
        discrete_state = new_discrete_state
    
    # Decaying is being done every episode if episode number is within decaying range
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value
    
    episode_rewards.append(episode_reward)
    if not episode % RENDER_EVERY:
        avg_reward = sum(episode_rewards[-RENDER_EVERY:])/RENDER_EVERY
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(avg_reward)
        aggr_ep_rewards['max'].append(max(episode_rewards[-RENDER_EVERY:]))
        aggr_ep_rewards['min'].append(min(episode_rewards[-RENDER_EVERY:]))
        
env.close()

0
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step Count15
Step Count16
Step Count17
1000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
2000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step Count15
Step Count16
Step Count17
Step Count18
Step Count19
Step Count20
Step Count21
Step Count22
3000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step Count15
Step Count16
Step Count17
Step Count18
Step Count19
Step Count20
Step Count21
Step Count22
Step Count23
Step Count24
Step Count25
Step 

13000
Step Count1
Step Count2
Step Count3
Step Count4
Step Count5
Step Count6
Step Count7
Step Count8
Step Count9
Step Count10
Step Count11
Step Count12
Step Count13
Step Count14
Step Count15
Step Count16
Step Count17
Step Count18
Step Count19
Step Count20
Step Count21
Step Count22
Step Count23
Step Count24
Step Count25
Step Count26
Step Count27
Step Count28
Step Count29
Step Count30
Step Count31
Step Count32
Step Count33
Step Count34
Step Count35
Step Count36
Step Count37
Step Count38
Step Count39
Step Count40
Step Count41
Step Count42
Step Count43
Step Count44
Step Count45
Step Count46
Step Count47
Step Count48
Step Count49
Step Count50
Step Count51
Step Count52
Step Count53
Step Count54
Step Count55
Step Count56
Step Count57
Step Count58
Step Count59
Step Count60
Step Count61
Step Count62
Step Count63
Step Count64
Step Count65
Step Count66
Step Count67
Step Count68
Step Count69
Step Count70
Step Count71
Step Count72
Step Count73
Step Count74
Step Count75
Step Count76
Step Count77
St

KeyboardInterrupt: 

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt 
plt.xlabel('Episodes')
plt.ylabel('Rewards')
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['avg'], label="average rewards")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['max'], label="max rewards")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['min'], label="min rewards")
plt.legend(loc=4)
plt.show()

In [None]:
print(aggr_ep_rewards['ep'])
print(aggr_ep_rewards['min'])
print(aggr_ep_rewards['avg'])

## Sample Run of Agent

In [27]:

done = False
discrete_state = get_discrete_state(env.reset()) #we always start with initial discrete state'
steps=0
while not done:
    steps+=1
    print(steps)
    env.render()
        
    action = np.argmax(q_table[discrete_state])
        
    state, reward, done, _ = env.step(action)# state is a tuple of (position,cart_velocity,pole_angle,pole_velocity)

    discrete_state=get_discrete_state(state)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200


# Creating Own RL Environment

In [1]:
#May require to install opencv if it is not available: conda install -c menpo opencv

#REMEMBER: Open CV used BGR image encoding format
import cv2

import numpy as np 
from PIL import Image
import pickle #to save and load python objects
import matplotlib.pyplot as plt
from matplotlib import style 
import time
import os

In [2]:
style.use('ggplot')

In [3]:
#Setting the constants
SIZE = 10 #Grid size of environment will be 10*10
EPISODES = 25000
MOVE_PENALTY = 1
ENEMY_PENALTY = 300
FOOD_REWARD = 25
epsilon = 1 #epsilon greedy policy
EPS_DECAY = 0.9998
SHOW_EVERY = 3000
start_q_table = None #can be prior q table saved using pickle library
LEARNING_RATE = 0.1
DISCOUNT = 0.95

In [4]:
PLAYER_N=1
FOOD_N=2
ENEMY_N=3
#open cv using BGR format for image representation
d={1:(255,0,0), #Player is Blue
   2:(0,255,0), #Food is Green
   3:(0,0,255)  #Enemy is Red
}

In [5]:
#Creating class for player, food and enemy
class Blob:
    def __init__(self): #Python constructor
        self.x = np.random.randint(0,SIZE)
        self.y = np.random.randint(0,SIZE)
        #To Do: There can be a issue in which enemy and/or player and/or food land on the same cell
    
    #for debugging purposes
    def __str__(self):
        return f"{self.x}, {self.y}"
    
    #operator overloading
    def __sub__(self,second):
        return (self.x - second.x, self.y-second.y)
        
    #defining possible actions by Blob agent
    def action(self,choice): #very simplified discrete action space consisting of four actions only
        if choice == 0:
            self.move(x=1,y=1)
        elif choice==1:
            self.move(x=-1,y=-1)
        elif choice ==2:
            self.move(x=-1,y=1)
        elif choice ==3:
            self.move(x=1,y=-1)
        elif choice == 4:#Move right
            self.move(x=1,y=0)
        elif choice ==5 :#Move left
            self.move(x=-1,y=0)
        elif choice ==6: #Move Up
            self.move(x=0,y=1)
        elif choice ==7: #Move Down
            self.move(x=0,y=-1)
        #To Do: Add more choices
    
    def move(self, x=False,y=False):
        #The agent will move either randomly or based on value passed in x or y
        #if not x:#x is local var and self.x is class var
        #    self.x = np.random.randint(-1,2)  
        #else:
        self.x += x
            
        #if not y:
        #    self.y = np.random.randint(-1,2)  
        #else:
        self.y += y
        
        #We have to also ensure that blob does not move outside the boundaries
        if self.x < 0:
            self.x = 0
        elif self.x > (SIZE-1):
            self.x = SIZE-1
        
        if self.y < 0:
            self.y = 0
        elif self.y > (SIZE-1):
            self.y = SIZE-1

In [6]:
#States of our Q Table consist of difference between x and y coordinates of the player and food Blob AND player and enemy Blob
if start_q_table is None:
    q_table = {} #dictionary
    for x1 in range(-SIZE+1, SIZE):
        for y1 in range(-SIZE+1, SIZE):
            for x2 in range(-SIZE+1, SIZE):
                for y2 in range(-SIZE+1,SIZE):
                    q_table[((x1,y1),(x2,y2))]= [np.random.uniform(-5,0) for i in range(8)] #since there are eight discrete actions
                    #The initial values need to be modified to see the impact
                    
else: #The Q Table exists and may be present in the form of pickle object
    with open(start_q_table,"rb") as f:
        q_table = pickle.load(f)



## Q Learning Algorithm on Custom RL Environment

In [7]:
START_EPSILON_DECAYING = 1#From which episode we want to start to decay epsilon
END_EPSILON_DECAYING = EPISODES // 2 #Till which episode we want to decay epsilon
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)
episode_rewards = []
for episode in range(EPISODES):
    player = Blob()
    food = Blob()
    enemy = Blob()
    
    if episode % SHOW_EVERY == 0:
        print(f'on #{episode}, epsilon: {epsilon}')
        print(f'{SHOW_EVERY} ep mean {np.mean(episode_rewards[-SHOW_EVERY: ])}')
        show = True
    else:
        show = False
    
    #updating epsilon value
    
    episode_reward = 0
    for i in range(200): #Here 200 is the steps in each episode. It is a hyperparameter
        obs = (player - food, player - enemy)#Remember the function overriding
        if np.random.random() > epsilon:
            action = np.argmax(q_table[obs])
        else:
            action = np.random.randint(0,8)
        
        player.action(action)
       
        #Deciding the reward or penalty of agent after every step in each episode
        if player.x == enemy.x and player.y == enemy.y:
            reward = -ENEMY_PENALTY
        elif player.x == food.x and player.y == food.y:
            reward = FOOD_REWARD
        else:
            reward = -MOVE_PENALTY
            
        new_obs = (player-food,player-enemy)
        max_future_q = np.max(q_table[new_obs])
        current_q = q_table[obs][action]
    
        if reward == FOOD_REWARD:
            new_q = FOOD_REWARD
            q_table[obs][action] = new_q
            episode_rewards.append(reward)
            break
        elif reward == -ENEMY_PENALTY:
            new_q = -ENEMY_PENALTY
            q_table[obs][action] = new_q
            episode_rewards.append(reward)
            break
        else:
            new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
            q_table[obs][action] = new_q
            episode_rewards.append(reward)
        
        #Code to display the environment
        if episode>24000:
            env=np.zeros((SIZE,SIZE,3),dtype=np.uint8)
            env[food.y][food.x] = d[FOOD_N]
            env[player.y][player.x] = d[PLAYER_N]
            env[enemy.y][enemy.x] = d[ENEMY_N]            
            
            img = Image.fromarray(env,'RGB')#from PIL library
            img = img.resize((400,400))
            cv2.imshow('',np.array(img))
            cv2.waitKey(100)

        
    
    # Decaying is being done every episode if episode number is within decaying range
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

on #0, epsilon: 1
3000 ep mean nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


on #3000, epsilon: 0.7600608048642442
3000 ep mean -1.9836666666666667
on #6000, epsilon: 0.5200416033279763
3000 ep mean -1.676
on #9000, epsilon: 0.2800224017918611
3000 ep mean -1.3553333333333333
on #12000, epsilon: 0.04000320025574697
3000 ep mean -0.935
on #15000, epsilon: -8.000640078544932e-05
3000 ep mean -0.5796666666666667
on #18000, epsilon: -8.000640078544932e-05
3000 ep mean -0.61
on #21000, epsilon: -8.000640078544932e-05
3000 ep mean -0.415
on #24000, epsilon: -8.000640078544932e-05
3000 ep mean -0.5493333333333333


KeyboardInterrupt: 

# Deep Q Network (DQN)
![alt text](https://github.com/anubhavpatrick/Advanced-Deep-Learning-Workshop/raw/master/dqn.png)

In [1]:
import numpy as np
import keras.backend.tensorflow_backend as backend
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import tensorflow as tf
from collections import deque
import time
import random
from tqdm import tqdm
import os
from PIL import Image
import cv2


DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1_000  # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
UPDATE_TARGET_EVERY = 5  # Terminal states (end of episodes)
MODEL_NAME = '2x256'
MIN_REWARD = -200  # For model save
MEMORY_FRACTION = 0.20

# Environment settings
EPISODES = 5_000

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

#  Stats settings
AGGREGATE_STATS_EVERY = 10  # episodes
SHOW_PREVIEW = True

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
#Custom RL Environment Coding
class Blob:
    def __init__(self, size):
        self.size = size
        self.x = np.random.randint(0, size)
        self.y = np.random.randint(0, size)

    def __str__(self):
        return f"Blob ({self.x}, {self.y})"

    def __sub__(self, other):
        return (self.x-other.x, self.y-other.y)

    def __eq__(self, other):
        return self.x == other.x and self.y == other.y

    def action(self, choice):
        '''
        Gives us 9 total movement options. (0,1,2,3,4,5,6,7,8)
        '''
        if choice == 0:
            self.move(x=1, y=1)
        elif choice == 1:
            self.move(x=-1, y=-1)
        elif choice == 2:
            self.move(x=-1, y=1)
        elif choice == 3:
            self.move(x=1, y=-1)

        elif choice == 4:
            self.move(x=1, y=0)
        elif choice == 5:
            self.move(x=-1, y=0)

        elif choice == 6:
            self.move(x=0, y=1)
        elif choice == 7:
            self.move(x=0, y=-1)

        elif choice == 8:
            self.move(x=0, y=0)

    def move(self, x=False, y=False):

        # If no value for x, move randomly
        if not x:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x

        # If no value for y, move randomly
        if not y:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y

        # If we are out of bounds, fix!
        if self.x < 0:
            self.x = 0
        elif self.x > self.size-1:
            self.x = self.size-1
        if self.y < 0:
            self.y = 0
        elif self.y > self.size-1:
            self.y = self.size-1


class BlobEnv:
    SIZE = 10
    RETURN_IMAGES = True
    MOVE_PENALTY = 1
    ENEMY_PENALTY = 300
    FOOD_REWARD = 25
    OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)  # 4
    ACTION_SPACE_SIZE = 9
    PLAYER_N = 1  # player key in dict
    FOOD_N = 2  # food key in dict
    ENEMY_N = 3  # enemy key in dict
    # the dict! (colors)
    d = {1: (255, 175, 0),
         2: (0, 255, 0),
         3: (0, 0, 255)}

    def reset(self):
        self.player = Blob(self.SIZE)
        self.food = Blob(self.SIZE)
        while self.food == self.player:
            self.food = Blob(self.SIZE)
        self.enemy = Blob(self.SIZE)
        while self.enemy == self.player or self.enemy == self.food:
            self.enemy = Blob(self.SIZE)

        self.episode_step = 0

        if self.RETURN_IMAGES:
            observation = np.array(self.get_image())
        else:
            observation = (self.player-self.food) + (self.player-self.enemy)
        return observation

    def step(self, action):
        self.episode_step += 1
        self.player.action(action)

        #### MAYBE ###
        #enemy.move()
        #food.move()
        ##############

        if self.RETURN_IMAGES:
            new_observation = np.array(self.get_image())
        else:
            new_observation = (self.player-self.food) + (self.player-self.enemy)

        if self.player == self.enemy:
            reward = -self.ENEMY_PENALTY
        elif self.player == self.food:
            reward = self.FOOD_REWARD
        else:
            reward = -self.MOVE_PENALTY

        done = False
        if reward == self.FOOD_REWARD or reward == -self.ENEMY_PENALTY or self.episode_step >= 200:
            done = True

        return new_observation, reward, done

    def render(self):
        img = self.get_image()
        img = img.resize((300, 300))  # resizing so we can see our agent in all its glory.
        cv2.imshow("image", np.array(img))  # show it!
        cv2.waitKey(1)

    # FOR CNN #
    def get_image(self):
        env = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)  # starts an rbg of our size
        env[self.food.x][self.food.y] = self.d[self.FOOD_N]  # sets the food location tile to green color
        env[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]  # sets the enemy location to red
        env[self.player.x][self.player.y] = self.d[self.PLAYER_N]  # sets the player tile to blue
        img = Image.fromarray(env, 'RGB')  # reading to rgb. Apparently. Even tho color definitions are bgr. ???
        return img


env = BlobEnv()

In [None]:
#DQN Algorithnm
# For stats
ep_rewards = [-200]

# For more repetitive results
random.seed(1)
np.random.seed(1)
tf.set_random_seed(1)

# Memory fraction, used mostly when trai8ning multiple agents
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
#backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

# Create models folder
if not os.path.isdir('models'):
    os.makedirs('models')


# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):

    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.FileWriter(self.log_dir)

    # Overriding this method to stop creating default log writer
    def set_model(self, model):
        pass

    # Overrided, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
    def on_epoch_end(self, epoch, logs=None):
        self.update_stats(**logs)

    # Overrided
    # We train for one batch only, no need to save anything at epoch end
    def on_batch_end(self, batch, logs=None):
        pass

    # Overrided, so won't close writer
    def on_train_end(self, _):
        pass

    # Custom method for saving own metrics
    # Creates writer, writes custom metrics and closes writer
    def update_stats(self, **stats):
        self._write_logs(stats, self.step)


# Agent class
class DQNAgent:
    def __init__(self):

        # Main model
        self.model = self.create_model()

        # Target network
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Custom tensorboard object
        self.tensorboard = ModifiedTensorBoard(log_dir="logs/{}-{}".format(MODEL_NAME, int(time.time())))

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

    def create_model(self):
        model = Sequential()

        model.add(Conv2D(256, (3, 3), input_shape=env.OBSERVATION_SPACE_VALUES))  # OBSERVATION_SPACE_VALUES = (10, 10, 3) a 10x10 RGB image.
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Conv2D(256, (3, 3)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
        model.add(Dense(64))

        model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear'))  # ACTION_SPACE_SIZE = how many choices (9)
        model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=['accuracy'])
        return model

    # Adds step's data to a memory replay array
    # (observation space, action, reward, new observation space, done)
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    # Trains main network every step during episode
    def train(self, terminal_state, step):

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Get a minibatch of random samples from memory replay table
        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Get current states from minibatch, then query NN model for Q values
        current_states = np.array([transition[0] for transition in minibatch])/255
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query NN model for Q values
        # When using target network, query it, otherwise main network should be queried
        new_current_states = np.array([transition[3] for transition in minibatch])/255
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):

            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.array(X)/255, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0, shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        return self.model.predict(np.array(state).reshape(-1, *state.shape)/255)[0]


agent = DQNAgent()

# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    # Update tensorboard step every episode
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()

    # Reset flag and start iterating until episode ends
    done = False
    while not done:

        # This part stays mostly the same, the change is to query a model for Q values
        if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done = env.step(action)

        # Transform new continous state to new discrete state and count reward
        episode_reward += reward

        if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
            env.render()

        # Every step we update replay memory and train main network
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        agent.tensorboard.update_stats(reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # Save model, but only when min reward is greater or equal a set value
        if min_reward >= MIN_REWARD:
            agent.model.save(f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    # Decay epsilon
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.




  1%|          | 45/5000 [09:46<41:10:48, 29.92s/episodes]