In [243]:
import gym
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [1081]:
# --- Training budget ---
num_episodes = 25000  # number of episodes handed to the agent's train()
max_steps = 100       # upper bound on steps taken within a single episode

# --- Q-learning update ---
alpha = 0.8  # learning rate
gamma = 0.7  # discount factor

# --- Exploration schedule ---
# epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.005

In [1077]:
class QLearningAgent:
  """Tabular Q-learning agent for an environment with discrete states/actions.

  The Q-table has one row per state and one column per action, sized from
  ``env.observation_space.n`` and ``env.action_space.n``.

  The environment is driven through the interface used in this notebook:
  ``reset()`` returns the initial state and ``step(action)`` returns the
  4-tuple ``(next_state, reward, done, info)``.

  ``train`` and ``evaluate`` read the per-episode step cap ``max_steps`` from
  the notebook's module-level configuration, and ``train`` additionally reads
  ``min_epsilon``, ``max_epsilon`` and ``decay_rate``. Those names must be
  defined before either method is called.
  """

  def __init__(self, env, alpha, gamma):
    """Store the environment and hyperparameters and zero-initialise Q.

    Args:
      env: Environment exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` and ``step(action)``.
      alpha: Learning rate applied to each temporal-difference update.
      gamma: Discount factor applied to the best next-state value.
    """
    self.env = env
    self.alpha = alpha
    self.gamma = gamma
    self.Q = np.zeros([env.observation_space.n, env.action_space.n])

  def epsilon_greedy(self, state, epsilon):
    """Choose an action for ``state``.

    Args:
      state: Row index into the Q-table.
      epsilon: Probability of taking a random action instead of the greedy one.

    Returns:
      A sampled action with probability ``epsilon``; otherwise the index of
      the largest Q-value in the row for ``state``.
    """
    if np.random.uniform(0, 1) < epsilon:
      # Sample from the agent's own environment, not a notebook-level `env`.
      return self.env.action_space.sample()
    return np.argmax(self.Q[state])

  def train(self, num_episodes):
    """Run Q-learning for ``num_episodes`` episodes.

    Each episode lasts until the environment reports ``done`` or
    ``max_steps`` steps have been taken. After every episode epsilon is set to
    ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)``.

    Args:
      num_episodes: Number of training episodes to run.

    Returns:
      The Q-table, which is the same array object as ``self.Q``.
    """
    epsilon = max_epsilon

    for episode in range(num_episodes):
      state = self.env.reset()

      for _step in range(max_steps):
        action = self.epsilon_greedy(state, epsilon)
        next_state, reward, done, _info = self.env.step(action)

        # Use the agent's own alpha/gamma so the constructor arguments are
        # what actually drive learning.
        td_target = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state][action] += self.alpha * (td_target - self.Q[state][action])

        if done:
          break
        state = next_state

      # Decay after every episode, including ones cut off by max_steps, so
      # the schedule depends only on the episode index.
      epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    return self.Q

  def evaluate(self, Q, num_episodes):
    """Roll out the greedy policy of ``Q`` and collect per-episode rewards.

    Args:
      Q: Table indexed as ``Q[state]`` giving one value per action.
      num_episodes: Number of evaluation episodes to run.

    Returns:
      A NumPy array of length ``num_episodes`` holding the summed reward of
      each episode.
    """
    total_rewards = np.zeros(num_episodes)

    for episode in range(num_episodes):
      state = self.env.reset()
      episode_reward = 0

      for _step in range(max_steps):
        action = np.argmax(Q[state])
        state, reward, done, _info = self.env.step(action)
        episode_reward += reward
        if done:
          break

      # Record outside the `done` branch so an episode that hits max_steps
      # still reports the reward it accumulated.
      total_rewards[episode] = episode_reward

    return total_rewards



# Build the environment and train an agent on it. The learning rate and
# discount factor come from the configuration values defined earlier in the
# notebook rather than repeated literals, so the two cannot drift apart.
env = gym.make('FrozenLake-v1')
agent = QLearningAgent(env, alpha=alpha, gamma=gamma)
Q = agent.train(num_episodes)

In [1080]:
# Roll out the greedy policy of the learned table for 100 episodes and
# report the mean per-episode reward, rounded to four decimals.
tot_reward = agent.evaluate(Q, 100)
avg_reward = round(tot_reward.mean(), 4)
print(f"Average reward: {avg_reward}")

Average reward: 0.7
