# Q-Learning

In [1]:
import gym
import random
import numpy as np
from collections import namedtuple, deque

### Untrained Agent without Reinforcement Learning  
Here, the agent plays by sampling random actions from its action space. Unsurprisingly, it does terribly.  

In [3]:
# Create the Taxi-v3 environment and show its observation space and the
# number of available actions, one per line.
env = gym.make('Taxi-v3')
print(env.observation_space, env.action_space.n, sep='\n')

Discrete(500)
6


Both the state space and the action space are discrete in this environment: there are 500 states and 6 actions.

In [4]:
# Baseline: play 10 episodes choosing every action uniformly at random.
# For each episode we report the summed reward and how many steps returned
# a reward of -10, which this notebook counts as a "penalty".
for i in range(10):
    state = env.reset()
    done = False
    score = penalties = 0
    while True:
        # Uncomment to watch the rollout:
        # env.render()
        action = env.action_space.sample()
        state_, reward, done, _ = env.step(action)
        score += reward
        penalties += 1 if reward == -10 else 0
        state = state_
        if done:
            break
    print(f'Episode {i} | Score: {score} | Penalties: {penalties}')
env.close()

Episode 0 | Score: -355 | Penalties: 31
Episode 1 | Score: -704 | Penalties: 56
Episode 2 | Score: -704 | Penalties: 56
Episode 3 | Score: -785 | Penalties: 65
Episode 4 | Score: -866 | Penalties: 74
Episode 5 | Score: -758 | Penalties: 62
Episode 6 | Score: -545 | Penalties: 45
Episode 7 | Score: -758 | Penalties: 62
Episode 8 | Score: -794 | Penalties: 66
Episode 9 | Score: -713 | Penalties: 57


## Building the Q-Learning model

In [9]:
class Agent:
    """Tabular Q-learning agent with an epsilon-greedy action choice.

    The agent keeps a table with one row per state and one column per action
    (``Q_table``), initialised to zeros, and updates a single entry per call
    to ``learn``.
    """

    def __init__(self, env, epsilon, alpha, gamma):
        """Store the environment and hyperparameters and allocate the Q-table.

        Args:
            env: environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
            epsilon: threshold compared against a uniform draw from [0, 1];
                draws below it trigger a randomly sampled action.
            alpha: weight given to the new target in the Q-value update.
            gamma: factor applied to the best next-state Q-value in the target.
        """
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        self.env = env
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.Q_table = np.zeros((n_states, n_actions))
        # Lightweight record type used to hand one transition to `learn`.
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "state_"])

    def choose_action(self, state):
        """Return an action for ``state``.

        A random action is sampled from the environment's action space when a
        uniform draw falls below ``epsilon``; otherwise the action with the
        highest Q-value in the row for ``state`` is returned.
        """
        explore = random.uniform(0, 1) < self.epsilon
        if explore:
            return self.env.action_space.sample()
        return np.argmax(self.Q_table[state])

    def learn(self, xp):
        """Update one Q-table entry from a transition.

        Args:
            xp: a 4-item sequence ``(state, action, reward, state_)`` where
                ``state_`` is the state reached after taking ``action``.
        """
        state, action, reward, state_ = xp
        best_next = np.max(self.Q_table[state_])
        td_target = reward + self.gamma * best_next
        old_value = self.Q_table[state, action]
        # Blend the previous estimate with the bootstrapped target.
        self.Q_table[state, action] = (1 - self.alpha) * old_value + self.alpha * td_target

## Training the model

In [78]:
%%time
env = gym.make('Taxi-v3')
agent = Agent(env, epsilon=0.1, alpha=0.1, gamma=0.6)
scores, timesteps, total_penalties = [], [], []

for i in range(50000):
    state = env.reset()
    score = 0
    done = False
    t, penalties = 0, 0
    while not done:
#         env.render()
        action = agent.choose_action(state)
        state_, reward, done, _ = env.step(action)
        score += reward
        xp = agent.experience(state, action, reward, state_)
        agent.learn(xp)
        state = state_
        if reward == -10:
            penalties += 1
        t += 1
    timesteps.append(t)
    total_penalties.append(penalties)
    if (i+1) % 1000 == 0:
        print(f'Episode {i+1} | Score: {score} | Avg Timesteps: {np.mean(timesteps[-100:])}',
              f'| Avg Penalties: {np.mean(total_penalties[-100:])}')
env.close()

Episode 1000 | Score: -11 | Avg Timesteps: 46.5 | Avg Penalties: 1.61
Episode 2000 | Score: -67 | Avg Timesteps: 20.72 | Avg Penalties: 0.62
Episode 3000 | Score: 2 | Avg Timesteps: 17.04 | Avg Penalties: 0.56
Episode 4000 | Score: 11 | Avg Timesteps: 17.49 | Avg Penalties: 0.55
Episode 5000 | Score: 11 | Avg Timesteps: 15.51 | Avg Penalties: 0.45
Episode 6000 | Score: -7 | Avg Timesteps: 14.74 | Avg Penalties: 0.47
Episode 7000 | Score: 7 | Avg Timesteps: 15.75 | Avg Penalties: 0.39
Episode 8000 | Score: 8 | Avg Timesteps: 15.3 | Avg Penalties: 0.48
Episode 9000 | Score: -4 | Avg Timesteps: 14.64 | Avg Penalties: 0.33
Episode 10000 | Score: -1 | Avg Timesteps: 15.27 | Avg Penalties: 0.49
Episode 11000 | Score: -10 | Avg Timesteps: 15.37 | Avg Penalties: 0.51
Episode 12000 | Score: 11 | Avg Timesteps: 14.68 | Avg Penalties: 0.43
Episode 13000 | Score: 5 | Avg Timesteps: 14.86 | Avg Penalties: 0.47
Episode 14000 | Score: 4 | Avg Timesteps: 14.69 | Avg Penalties: 0.52
Episode 15000 | Sco

## Testing the model

In [92]:
# Evaluate the trained agent for 100 episodes, always taking the action with
# the highest Q-value (no exploration and no further learning).
# A fresh environment is created here because the one used for training was
# already closed at the end of the training cell.
env = gym.make('Taxi-v3')
total_penalties, scores, timesteps = [], [], []
for i in range(100):
    state = env.reset()
    score, penalties, t = 0, 0, 0
    done = False
    while not done:
        # Uncomment to watch the rollout:
        # env.render()
        action = np.argmax(agent.Q_table[state])
        state_, reward, done, _ = env.step(action)
        score += reward
        # A reward of -10 is counted as a penalty.
        if reward == -10:
            penalties += 1
        state = state_
        t += 1
    scores.append(score)
    total_penalties.append(penalties)
    timesteps.append(t)
print(f'Avg Timesteps: {np.mean(timesteps)} | Avg Penalties: {np.mean(total_penalties)}')
env.close()

Avg Timesteps: 12.98 | Avg Penalties: 0.0


The Q-learning model performs really well: over these 100 test episodes the agent incurs 0 penalties and needs only about 13 steps on average to drop the passenger off at the right location!