# Q-learning 

In [None]:
#!pip install cmake 'gym[atari]' scipy

from collections import defaultdict
import pickle
import random
import click
import gym

In [None]:
def select_optimal_action(q_table, state, action_space):
    """Return the highest-valued known action for ``state``.

    Args:
        q_table: Mapping of state -> {action: q_value}.
        state: Key to look up in ``q_table``.
        action_space: Object exposing ``sample()``, used as the fallback.

    Returns:
        The action with the largest Q-value recorded for ``state`` (the last
        one in iteration order on ties), or ``action_space.sample()`` when
        the table holds no action values for ``state``.
    """
    # .get() keeps lookups of unseen states from raising KeyError on a plain
    # dict and from inserting placeholder entries into a defaultdict.
    action_values = q_table.get(state)
    if not action_values:
        return action_space.sample()

    # Start below every possible value so that negative Q-values can win.
    best_action = None
    best_value = float("-inf")
    for action, action_q_value in action_values.items():
        if action_q_value >= best_value:
            best_value = action_q_value
            best_action = action

    # Compare against None explicitly: action 0 is falsy but perfectly valid.
    return best_action if best_action is not None else action_space.sample()

## Train

In [None]:
# The hyperparameters
# Learning rate: weight given to the new estimate when update() blends it
# with the old Q-value.
alpha = 0.1
# Discount factor: multiplier update() applies to the best Q-value of the
# next state.
gamma = 0.6
# Exploration rate: update() takes a random action when a uniform draw from
# [0, 1] falls below this value.
epsilon = 0.1

# Number of training episodes passed to train_agent().
NUM_EPISODES = 100000


def update(q_table, env, state):
    """Take one epsilon-greedy step in ``env`` and update ``q_table`` in place.

    Reads the module-level hyperparameters ``alpha``, ``gamma`` and
    ``epsilon``.

    Args:
        q_table: Mutable mapping of state -> {action: q_value}.
        env: Gym-style environment exposing ``step()`` and a discrete
            ``action_space`` (``sample()`` and ``n``).
        state: The state the environment is currently in.

    Returns:
        ``(next_state, reward)`` as reported by ``env.step``.
    """
    # Epsilon-greedy: explore with probability epsilon, otherwise exploit.
    if random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
    else:
        action = select_optimal_action(q_table, state, env.action_space)

    # Only the first two items are needed. Slicing accepts both the 4-tuple
    # returned by older Gym releases and the 5-tuple returned by Gym >= 0.26.
    next_state, reward = env.step(action)[:2]

    # Make sure both rows exist. .get() avoids depending on the table being
    # a defaultdict, so a plain dict (e.g. a reloaded table) works too.
    for seen_state in (state, next_state):
        if not q_table.get(seen_state):
            q_table[seen_state] = {a: 0 for a in range(env.action_space.n)}

    old_q_value = q_table[state][action]

    # Maximum q_value for the actions in next state
    next_max = max(q_table[next_state].values())

    # Blend the old estimate with the bootstrapped target.
    new_q_value = (1 - alpha) * old_q_value + alpha * (reward + gamma * next_max)

    # Finally, update the q_value
    q_table[state][action] = new_q_value

    return next_state, reward


def train_agent(q_table, env, num_episodes):
    """Run ``num_episodes`` training episodes, updating ``q_table`` in place.

    Each episode keeps stepping until a step yields a reward of exactly 20
    (per Gym's Taxi documentation, the reward for a successful drop-off).
    Steps with a reward of -10 are counted as penalties. A per-episode
    summary is printed.

    Args:
        q_table: Mutable mapping of state -> {action: q_value}.
        env: Gym-style environment with a discrete action space.
        num_episodes: Number of episodes to run.

    Returns:
        The same ``q_table`` object, after training.
    """
    for i in range(num_episodes):
        state = env.reset()
        # Gym >= 0.26 returns (observation, info) from reset(); older
        # releases return the observation alone.
        if (isinstance(state, tuple) and len(state) == 2
                and isinstance(state[1], dict)):
            state = state[0]

        # .get() so that a plain dict works as well as a defaultdict.
        if not q_table.get(state):
            q_table[state] = {
                action: 0 for action in range(env.action_space.n)}

        epochs = 0
        num_penalties, reward, total_reward = 0, 0, 0
        while reward != 20:
            state, reward = update(q_table, env, state)
            total_reward += reward

            if reward == -10:
                num_penalties += 1

            epochs += 1
        print("\nTraining episode {}".format(i + 1))
        print("Time steps: {}, Penalties: {}, Reward: {}".format(epochs,
                                                                 num_penalties,
                                                                 total_reward))

    print("Training finished.\n")

    return q_table

In [None]:
env = gym.make("Taxi-v3")
# Each row of the table is an {action: q_value} dict, so a missing state
# defaults to an empty (falsy) dict rather than the integer 0.
q_table = defaultdict(dict)
q_table = train_agent(q_table, env, NUM_EPISODES)

# save the table for future use
# dict(...) stores a plain dict, i.e. without the default factory.
with open("q_table.pickle", "wb") as f:
    pickle.dump(dict(q_table), f)

## Evaluation

In [None]:
# Number of evaluation trials. This rebinds the module-level name that
# earlier held the training episode count.
NUM_EPISODES = 100


def evaluate_agent(q_table, env, num_trials):
    """Run the greedy policy from ``q_table`` and print average statistics.

    Each trial keeps stepping until a step yields a reward of exactly 20
    (per Gym's Taxi documentation, the reward for a successful drop-off).
    Steps with a reward of -10 are counted as penalties.

    Args:
        q_table: Mapping of state -> {action: q_value}.
        env: Gym-style environment exposing ``reset()``, ``step()`` and
            ``action_space``.
        num_trials: Number of episodes to run; must be non-zero, since the
            averages divide by it.

    Returns:
        None. Results are printed.
    """
    total_epochs, total_penalties = 0, 0

    print("Running episodes...")
    for _ in range(num_trials):
        state = env.reset()
        # Gym >= 0.26 returns (observation, info) from reset(); older
        # releases return the observation alone.
        if (isinstance(state, tuple) and len(state) == 2
                and isinstance(state[1], dict)):
            state = state[0]
        epochs, num_penalties, reward = 0, 0, 0

        # NOTE(review): the loop ends only on a reward of 20. A policy that
        # never reaches it would make this loop run forever.
        while reward != 20:
            next_action = select_optimal_action(q_table,
                                                state,
                                                env.action_space)
            # Slicing accepts both the 4-tuple (older Gym) and the 5-tuple
            # (Gym >= 0.26) step results.
            state, reward = env.step(next_action)[:2]

            if reward == -10:
                num_penalties += 1

            epochs += 1

        total_penalties += num_penalties
        total_epochs += epochs

    average_time = total_epochs / float(num_trials)
    average_penalties = total_penalties / float(num_trials)
    print("Evaluation results after {} trials".format(num_trials))
    print("Average time steps taken: {}".format(average_time))
    print("Average number of penalties incurred: {}".format(average_penalties))


# Create an environment for the evaluation run.
env = gym.make("Taxi-v3")
# Reload the table written to q_table.pickle by the training cell.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("q_table.pickle", 'rb') as f:
    q_table = pickle.load(f)
evaluate_agent(q_table, env, NUM_EPISODES)