In [1]:
!pip install gym

Defaulting to user installation because normal site-packages is not writeable
Collecting gym
  Downloading gym-0.26.2.tar.gz (721 kB)
     ---------------------------------------- 0.0/721.7 kB ? eta -:--:--
     - ------------------------------------- 30.7/721.7 kB 1.3 MB/s eta 0:00:01
     - ------------------------------------- 30.7/721.7 kB 1.3 MB/s eta 0:00:01
     ---- -------------------------------- 92.2/721.7 kB 751.6 kB/s eta 0:00:01
     ---- -------------------------------- 92.2/721.7 kB 751.6 kB/s eta 0:00:01
     -------- --------------------------- 163.8/721.7 kB 701.4 kB/s eta 0:00:01
     -------- --------------------------- 163.8/721.7 kB 701.4 kB/s eta 0:00:01
     ---------- ------------------------- 204.8/721.7 kB 623.6 kB/s eta 0:00:01
     ----------- ------------------------ 225.3/721.7 kB 599.0 kB/s eta 0:00:01
     -------------- --------------------- 286.7/721.7 kB 655.2 kB/s eta 0:00:01
     --------------- -------------------- 307.2/721.7 kB 633.2 kB/s eta 0

In [6]:
import gym

# Deterministic FrozenLake: with is_slippery=False the agent always moves in
# the direction it chooses.
env = gym.make('FrozenLake-v1', is_slippery=False)

# Read the sizes from the environment instead of hard-coding them, so they
# stay correct if the map or environment id is changed.
action_space = env.action_space.n
observation_space = env.observation_space.n

print("Action Space: ", env.action_space)
print("State Space: ", env.observation_space)

Action Space:  Discrete(4)
State Space:  Discrete(16)


In [7]:
import numpy as np
# Seed every RNG the notebook draws from so that Restart & Run All
# reproduces the same training run.
SEED = 42
np.random.seed(SEED)         # used by the explore/exploit draw in choose_action
env.action_space.seed(SEED)  # used by env.action_space.sample()

# One row per state, one column per action; every value starts at zero.
Q_table = np.zeros([env.observation_space.n, env.action_space.n])

alpha = 0.1            # learning rate of the Q-value update
gamma = 0.99           # discount factor applied to the next state's value
epsilon = 1            # initial probability of taking a random action
epsilon_decay = 0.995  # multiplicative decay applied to epsilon after each episode
min_epsilon = 0.01     # floor below which epsilon is never decayed
episodes = 10000       # number of training episodes
max_steps = 100        # upper bound on steps taken within one training episode

In [8]:
def choose_action(state, Q_table, epsilon):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A single uniform draw in [0, 1) is compared against ``epsilon``: when the
    draw is smaller, an action is sampled from ``env.action_space``; otherwise
    the index of the largest entry of ``Q_table[state]`` is returned.

    Args:
        state: Index of the current state (row of ``Q_table``).
        Q_table: Table of action values indexed as ``Q_table[state]``.
        epsilon: Probability threshold for taking a random action.

    Returns:
        The chosen action index.
    """
    should_explore = np.random.uniform(0, 1) < epsilon
    if should_explore:
        return env.action_space.sample()
    return np.argmax(Q_table[state])

def update_q_table(Q_table, state, action, reward, next_state, alpha, gamma):
    """Apply one Q-learning update to ``Q_table[state, action]`` in place.

    The entry is moved a fraction ``alpha`` of the way towards
    ``reward + gamma * max_a Q_table[next_state, a]``.

    Args:
        Q_table: 2-D table of action values, modified in place.
        state: Index of the state the action was taken in.
        action: Index of the action that was taken.
        reward: Reward received for the transition.
        next_state: Index of the state reached after the action.
        alpha: Learning rate.
        gamma: Discount factor.
    """
    current_value = Q_table[state, action]
    greedy_next = np.argmax(Q_table[next_state])
    td_target = reward + gamma * Q_table[next_state, greedy_next]
    Q_table[state, action] = current_value + alpha * (td_target - current_value)

In [9]:
rewards = []  # total reward collected in each training episode

for episode in range(episodes):
    state = env.reset()[0]  # Reset environment at the beginning of each episode
    total_reward = 0
    done = False

    for step in range(max_steps):
        action = choose_action(state, Q_table, epsilon)
        # step() returns (obs, reward, terminated, truncated, info); keep both
        # end-of-episode flags instead of discarding `truncated`.
        next_state, reward, terminated, truncated, _ = env.step(action)

        update_q_table(Q_table, state, action, reward, next_state, alpha, gamma)

        state = next_state
        total_reward += reward

        # Stop on a terminal state OR when the environment itself cuts the
        # episode short, so the loop stays correct even if max_steps is
        # raised above the environment's own limit.
        done = terminated or truncated
        if done:
            break

    rewards.append(total_reward)
    # Decay exploration once per episode, never dropping below min_epsilon.
    epsilon = max(min_epsilon, epsilon * epsilon_decay)

print("Training finished.")


  if not isinstance(terminated, (bool, np.bool8)):


Training finished.


In [None]:
# Test the agent's performance
def evaluate_agent(Q_table, episodes=100):
    """Run the greedy policy read from ``Q_table`` and return its mean episode reward.

    Each episode resets the global ``env`` and repeatedly takes the action
    with the highest value in ``Q_table[state]`` until the environment
    reports the episode as terminated or truncated.

    Args:
        Q_table: Table of action values indexed as ``Q_table[state]``.
        episodes: Number of evaluation episodes to run.

    Returns:
        The sum of all rewards collected divided by ``episodes``.
    """
    total_rewards = 0

    for episode in range(episodes):
        state = env.reset()[0]
        done = False
        total_reward = 0

        while not done:
            action = np.argmax(Q_table[state])  # Always choose the action with the highest Q-value
            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            state = next_state
            # A greedy policy may never reach a terminal state (e.g. an
            # untrained table that always picks action 0), so the episode must
            # also end when the environment signals truncation; otherwise this
            # loop never exits.
            done = terminated or truncated

        total_rewards += total_reward

    avg_reward = total_rewards / episodes
    return avg_reward

# Score the learned greedy policy using evaluate_agent's default episode count.
avg_reward = evaluate_agent(Q_table)
print("Average reward after training: {}".format(avg_reward))


In [None]:
# Example of experimenting with different values
# Set different values for alpha, gamma, or epsilon and compare average rewards
alpha_values = [0.1, 0.5, 0.9]
gamma_values = [0.9, 0.95, 0.99]
epsilon_values = [1.0, 0.5, 0.1]

# Build the grid inside a comprehension so the iteration variables stay local.
# Looping directly with `for alpha in ...` would overwrite the global
# alpha / gamma / epsilon that the training cell reads, silently changing the
# result of re-running it.
hyperparameter_grid = [
    {"alpha": alpha_value, "gamma": gamma_value, "epsilon": epsilon_value}
    for alpha_value in alpha_values
    for gamma_value in gamma_values
    for epsilon_value in epsilon_values
]

for params in hyperparameter_grid:
    # Repeat the training and evaluation process for different hyperparameters
    # (params["alpha"], params["gamma"], params["epsilon"])
    # Compare average rewards or performance metrics
    pass


In [None]:
# Random policy (baseline method)
def random_agent(episodes=100):
    """Run a uniformly random policy on the global ``env`` and return its mean episode reward.

    Each episode resets ``env`` and samples actions from ``env.action_space``
    until the environment reports the episode as terminated or truncated.

    Args:
        episodes: Number of episodes to run.

    Returns:
        The sum of all rewards collected divided by ``episodes``.
    """
    total_rewards = 0

    for episode in range(episodes):
        env.reset()
        done = False
        total_reward = 0

        while not done:
            action = env.action_space.sample()  # Take random action
            _, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # End the episode on truncation as well as on a terminal state so
            # the baseline is measured over the same episode boundaries the
            # environment defines.
            done = terminated or truncated

        total_rewards += total_reward

    avg_reward = total_rewards / episodes
    return avg_reward

# Baseline score: mean reward of the random policy over its default episode count.
random_reward = random_agent()
print("Average reward of random agent: {}".format(random_reward))
