## **Module of functions that can use multiple environments**

In [5]:
!pip install cmake 'gym[atari]' scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Import the required packages

In [46]:
import gym 
import random
from IPython.display import clear_output
import numpy as np
from IPython.display import clear_output
from time import sleep
import itertools

Function to train the agent with Q-learning

In [84]:
def learning(the_game, alpha, gamma, epsilon, itritions = 100001,
             max_steps = 1000, decay_every = 5000, log_every = 100,
             alpha_decay = 0.005, gamma_decay = 0.05, epsilon_decay = 0.05):
    """Train a tabular Q-learning agent on a Gym environment.

    Parameters
    ----------
    the_game : str
        Environment id handed to ``gym.make`` (e.g. ``"Taxi-v3"``). The
        environment must expose ``observation_space.n`` and
        ``action_space.n`` because the Q-table is sized from them.
    alpha : float
        Initial learning rate of the Q-update.
    gamma : float
        Initial discount factor of the Q-update.
    epsilon : float
        Initial probability of taking a random (exploratory) action.
    itritions : int, optional
        Number of training episodes.
    max_steps : int, optional
        Maximum number of steps per episode before it is cut off.
    decay_every : int, optional
        Period, in episodes, of the hyperparameter decay. A falsy value
        disables decay.
    log_every : int, optional
        Period, in episodes, of the progress printout. A falsy value
        disables it.
    alpha_decay, gamma_decay, epsilon_decay : float, optional
        Fraction removed from the respective hyperparameter at every decay
        step (``value *= 1 - decay``).

    Returns
    -------
    tuple
        ``(env, q_table)``: the environment that was trained on and the
        learned table of shape ``(n_states, n_actions)``.
    """
    # `.env` takes the object wrapped by what gym.make returns.
    # NOTE(review): presumably this drops gym's own episode step limit, since
    # the loop below enforces `max_steps` itself -- confirm for the gym version in use.
    env = gym.make(the_game).env
    # Reset to a new starting state and show it once before training.
    env.reset()
    env.render()

    # One row per state, one column per action, all values start at zero.
    q_table = np.zeros([env.observation_space.n, env.action_space.n])

    for i in range(0, itritions):
        # This code expects reset() to return the state index directly and
        # step() to return a 4-tuple (next_state, reward, done, info).
        state = env.reset()
        steps = 0
        done = False

        while not done and steps < max_steps:
            if random.uniform(0, 1) < epsilon:
                action = env.action_space.sample()  # explore the action space
            else:
                action = np.argmax(q_table[state])  # exploit learned values

            next_state, reward, done, info = env.step(action)

            # Blend the old estimate with the bootstrapped target.
            old_value = q_table[state, action]
            next_max = np.max(q_table[next_state])
            q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

            state = next_state
            steps += 1

        # The modulo test is also true for i == 0, so the first decay is
        # applied right after the very first episode.
        if decay_every and i % decay_every == 0:
            alpha = alpha * (1 - alpha_decay)
            gamma = gamma * (1 - gamma_decay)
            epsilon = epsilon * (1 - epsilon_decay)

        if log_every and i % log_every == 0:
            # Replace the previous progress line instead of appending to it.
            clear_output(wait=True)
            print(f"Episode: {i}")

    print("Training finished.\n")

    return env, q_table

Function to evaluate the agent's performance after Q-learning

In [92]:
def evaluating(env, q_table, episodes, max_steps=1000):
    """Run the greedy policy of ``q_table`` and report aggregate statistics.

    Parameters
    ----------
    env : object
        Environment whose ``reset()`` returns a state index and whose
        ``step(action)`` returns ``(state, reward, done, info)``.
    q_table : array-like
        Table indexed as ``q_table[state]`` that yields one value per action.
    episodes : int
        Number of evaluation episodes to run.
    max_steps : int, optional
        Maximum number of steps per episode before it is cut off.

    Returns
    -------
    tuple
        ``(total_epochs, total_rewards)``: the total number of steps taken
        and the sum of ALL rewards received, over all episodes.
    """
    total_epochs, total_penalties, total_rewards = 0, 0, 0

    for _ in range(episodes):
        state = env.reset()
        epochs, penalties, episode_reward = 0, 0, 0
        done = False

        while not done and epochs < max_steps:
            # Always take the action with the highest learned value.
            action = np.argmax(q_table[state])
            state, reward, done, info = env.step(action)

            # Accumulate the return of the whole episode; keeping only the
            # reward of the final step would discard every earlier reward.
            episode_reward += reward

            # A reward of exactly -10 is counted as a penalty.
            # NOTE(review): this matches Taxi-v3's illegal pickup/drop-off
            # reward; other environments may need a different value.
            if reward == -10:
                penalties += 1

            epochs += 1

        total_penalties += penalties
        total_epochs += epochs
        total_rewards += episode_reward

    # Avoid a ZeroDivisionError when no episode was requested.
    avg_epochs = total_epochs / episodes if episodes else 0.0
    avg_penalties = total_penalties / episodes if episodes else 0.0

    print(f"Results after {episodes} episodes:")
    print(f"Average timesteps per episode: {avg_epochs}")
    print(f"Average penalties per episode: {avg_penalties}")
    return total_epochs, total_rewards

In [43]:
# Train on Taxi-v3 with alpha=0.9, gamma=1, epsilon=1 for 50001 episodes.
envi , q_tab = learning("Taxi-v3", 0.9,1,1,50001)

Episode: 50000
Training finished.



In [76]:
# Evaluate the trained Q-table over 1000 episodes; keeps the totals that evaluating() returns.
tepochs,trewards = evaluating(envi , q_tab, 1000)

Results after 1000 episodes:
Average timesteps per episode: 13.134
Average penalties per episode: 0.0


## **Grid search**

Grid search function to discover the best hyperparameters

In [96]:
def grid_search(the_game, itritions, episodes,
                alpha_range=None, gamma_range=None, epsilon_range=None):
    """Train and evaluate one agent per hyperparameter combination.

    Parameters
    ----------
    the_game : str
        Environment id forwarded to ``learning``.
    itritions : int
        Number of training episodes per combination.
    episodes : int
        Number of evaluation episodes per combination.
    alpha_range, gamma_range, epsilon_range : iterable of float, optional
        Candidate values for each hyperparameter. Each defaults to
        ``np.arange(0, 1, 0.5)``, i.e. the values 0.0 and 0.5.

    Returns
    -------
    list of dict
        One entry per combination, in ``itertools.product`` order, with the
        keys ``"atpe"`` (total reward divided by total timesteps of the
        evaluation run) and ``"paramters"`` (the ``(alpha, gamma, epsilon)``
        tuple). The key spelling is kept as-is so existing consumers of the
        result keep working.
    """
    default_grid = list(np.arange(0, 1, 0.5))
    alpha_values = default_grid if alpha_range is None else list(alpha_range)
    gamma_values = default_grid if gamma_range is None else list(gamma_range)
    epsilon_values = default_grid if epsilon_range is None else list(epsilon_range)

    # `results` instead of `all`, which would shadow the builtin.
    results = []
    for alpha, gamma, epsilon in itertools.product(alpha_values, gamma_values, epsilon_values):
        environment, q_table = learning(the_game, alpha, gamma, epsilon, itritions)
        tepochs, trewards = evaluating(environment, q_table, episodes)
        # No timesteps means there is nothing to average over.
        acc = trewards / tepochs if tepochs else float("nan")
        results.append({"atpe": acc, "paramters": (alpha, gamma, epsilon)})

    return results

In [97]:
# Grid search on Taxi-v3: 50001 training episodes and 1000 evaluation episodes per combination.
# NOTE: `best` receives the full list of per-combination results, not a single best entry.
best = grid_search("Taxi-v3",50001,1000)

Episode: 50000
Training finished.

Results after 1000 episodes:
Average timesteps per episode: 112.98
Average penalties per episode: 0.0
