# Reinforcement Learning 
#### Alvee Hoque 07/06/21

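This is a demo project exploring OpenAI's Gym with the MountainCar-v0 environment. It uses tabular Q-learning: the continuous observations are discretized into buckets and the action values are stored in a Q-table (no neural network is involved).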

### Imports

In [1]:
import gym
import numpy as np

### Environment

In [2]:
# Build the MountainCar environment and put it into its initial state.
env = gym.make('MountainCar-v0')
env.reset()

# Summarize the problem: how many discrete actions exist and what the
# upper/lower bounds of each observation component are.
observation_space = env.observation_space
print('Available Actions:', env.action_space.n)
print('High:', observation_space.high)
print('Low: ', observation_space.low)

Available Actions: 3
High: [0.6  0.07]
Low:  [-1.2  -0.07]


In [3]:
# Sanity-check rollout: play one episode while always sending action 2
# (push right, per the MountainCar-v0 docs -- TODO confirm) and print what
# the environment returns on every step.
state = env.reset()
action = 2
done = False

while True:
    # Classic gym step API: (observation, reward, done flag, info dict).
    new_state, reward, done, _ = env.step(action)
    print(reward, new_state)
    if done:
        break

-1.0 [-0.45670166  0.00050535]
-1.0 [-0.45569468  0.00100698]
-1.0 [-0.45419346  0.00150121]
-1.0 [-0.45220904  0.00198443]
-1.0 [-0.44975595  0.00245309]
-1.0 [-0.44685216  0.00290379]
-1.0 [-0.4435189   0.00333326]
-1.0 [-0.43978048  0.00373842]
-1.0 [-0.4356641   0.00411639]
-1.0 [-0.4311996  0.0044645]
-1.0 [-0.42641925  0.00478035]
-1.0 [-0.42135747  0.00506178]
-1.0 [-0.41605053  0.00530694]
-1.0 [-0.41053628  0.00551425]
-1.0 [-0.40485384  0.00568245]
-1.0 [-0.39904326  0.00581058]
-1.0 [-0.39314526  0.005898  ]
-1.0 [-0.38720088  0.00594438]
-1.0 [-0.38125117  0.00594971]
-1.0 [-0.37533692  0.00591425]
-1.0 [-0.36949832  0.00583859]
-1.0 [-0.36377475  0.00572357]
-1.0 [-0.35820446  0.00557029]
-1.0 [-0.35282435  0.00538011]
-1.0 [-0.34766974  0.0051546 ]
-1.0 [-0.34277419  0.00489555]
-1.0 [-0.33816929  0.00460491]
-1.0 [-0.33388447  0.00428481]
-1.0 [-0.32994694  0.00393754]
-1.0 [-0.32638146  0.00356548]
-1.0 [-0.32321031  0.00317115]
-1.0 [-0.32045317  0.00275714]
-1.0 [-0.3

#### Notes: 
The reward is always -1.0 and the new state values are position and velocity. 

In [4]:
# Number of buckets used for each observation dimension.
DISCRETE_OS_SIZE = [20, 20]

# Width of a single bucket per dimension: the full observation range
# (high - low) divided by the number of buckets along that dimension.
discrete_os_win_size = np.subtract(
    env.observation_space.high,
    env.observation_space.low,
) / DISCRETE_OS_SIZE
print(discrete_os_win_size)

[0.09  0.007]


In [5]:
# Q-table: one entry per (bucket along each observation dimension, action).
# Entries start as uniform random values in [-2, 0).
q_table = np.random.uniform(
    low=-2,
    high=0,
    size=(*DISCRETE_OS_SIZE, env.action_space.n),
)


In [6]:
# Q-value update rule:
# new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)


In [7]:
# --- Q-learning hyperparameters ---
# learning_rate: weight given to the new estimate when blending it with the
#                current Q value.
# discount:      multiplier applied to the best future Q value.
# episodes:      number of training episodes to run.
learning_rate, discount = 0.1, 0.95
episodes = 2000

# --- Exploration schedule ---
# epsilon is the starting exploration rate; epsilon_decay_value is the amount
# that takes it from its starting value to zero in equal steps across the
# [start_epsilon_decay, end_epsilon_decay] episode window.
epsilon = 1
start_epsilon_decay, end_epsilon_decay = 1, episodes // 2
epsilon_decay_value = epsilon / (end_epsilon_decay - start_epsilon_decay)

# --- Progress reporting ---
# Every show_every-th episode is printed and rendered.
show_every = 500

In [8]:
def get_discrete_state(state):
    """Convert a continuous observation into Q-table bucket indices.

    Each component of ``state`` is shifted by the observation space's lower
    bound and divided by the bucket width (``discrete_os_win_size``), then
    truncated to an integer.

    Args:
        state: Array-like observation with one value per observation
            dimension (position and velocity for MountainCar).

    Returns:
        A tuple of integer bucket indices, one per dimension, each within
        ``[0, DISCRETE_OS_SIZE[i] - 1]`` so it is always a valid Q-table index.
    """
    discrete_state = (state - env.observation_space.low) / discrete_os_win_size
    # An observation sitting exactly on the upper bound divides out to
    # DISCRETE_OS_SIZE[i], which is one past the last bucket and would raise
    # an IndexError on the Q-table. Clamp so it falls into the last bucket.
    last_bucket = np.asarray(DISCRETE_OS_SIZE) - 1
    return tuple(np.clip(discrete_state.astype(int), 0, last_bucket))

In [9]:
# Train the Q-table with epsilon-greedy exploration.
#
# The exploration rate starts from the configured `epsilon` and is decayed on
# a local copy, so re-running this cell starts from the same exploration rate
# rather than an already-decayed one.
exploration_rate = epsilon

for episode in range(episodes):
    discrete_state = get_discrete_state(env.reset())
    done = False

    # Only every `show_every`-th episode is logged and rendered.
    render = episode % show_every == 0
    if render:
        print(episode)

    while not done:
        # Epsilon-greedy action selection: exploit the best known action
        # unless a uniform draw falls within the exploration rate, in which
        # case pick a random action instead.
        if np.random.random() > exploration_rate:
            action = np.argmax(q_table[discrete_state])
        else:
            action = np.random.randint(0, env.action_space.n)

        # Classic gym step API: (observation, reward, done flag, info dict).
        new_state, reward, done, _ = env.step(action)
        new_discrete_state = get_discrete_state(new_state)

        if render:
            env.render()

        if not done:
            # Best Q value reachable from the next state.
            max_future_q = np.max(q_table[new_discrete_state])

            # Current Q value of the (state, action) pair just taken.
            current_q = q_table[discrete_state + (action,)]

            # Blend the old estimate with the newly observed return estimate.
            new_q = (1 - learning_rate) * current_q + learning_rate * (reward + discount * max_future_q)

            q_table[discrete_state + (action,)] = new_q

        elif new_state[0] >= env.goal_position:
            # The episode ended because the goal was reached (not because it
            # merely ran out of steps): give the final move a Q value of 0.
            q_table[discrete_state + (action,)] = 0

        discrete_state = new_discrete_state

    # Linearly reduce exploration inside the configured episode window and
    # never let the rate drop below zero.
    if start_epsilon_decay <= episode <= end_epsilon_decay:
        exploration_rate = max(0.0, exploration_rate - epsilon_decay_value)

env.close()

0
500
1000
1500
