In [32]:
# (a)
import gymnasium as gym

# Create FrozenLake-v1 environment with a 4x4 grid, slippery=True by default
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True, render_mode=None)
print("Environment created.")

Environment created.


In [41]:
#(b)
import numpy as np
import pandas as pd

# Initialize variables
num_episodes = 10000
data = []


def get_goal_proximity(state):
    row, col = divmod(state, 4)
    goal_row, goal_col = 3, 3
    return abs(goal_row - row) + abs(goal_col - col)

# Collect data
for episode in range(num_episodes):
    state, _ = env.reset()
    done = False
    episode_data = []
    total_reward = 0

    while not done:
        action = env.action_space.sample()
        next_state, reward, done, truncated, info = env.step(action)

        # Collect step data
        episode_data.append({
            "State": state,
            "Action": action,
            "Reward": reward,
            "GoalProximity": get_goal_proximity(state)
        })

        state = next_state
        total_reward += reward

    # Add total reward to each step in the episode
    for entry in episode_data:
        entry["TotalReward"] = total_reward
        data.append(entry)



# Convert to pandas DataFrame
df = pd.DataFrame(data)

# Show first few rows
df.head()


Unnamed: 0,State,Action,Reward,GoalProximity,TotalReward
0,0,2,0.0,6,0.0
1,1,3,0.0,5,0.0
2,1,1,0.0,5,0.0
3,0,2,0.0,6,0.0
4,0,2,0.0,6,0.0


In [42]:
#(c)
# Total reward is 1 only when the goal is reached
success_episodes = sum(1 for d in data if d["TotalReward"] == 1)
print(f"Proportion of successful episodes: {success_episodes / num_episodes:.4f}")


Proportion of successful episodes: 0.1734


In [43]:
# (d)
from collections import defaultdict

# Count successful and total (s, a) occurrences
sa_counts = defaultdict(int)
sa_successes = defaultdict(int)

for entry in data:
    key = (entry['State'], entry['Action'])
    sa_counts[key] += 1
    if entry['TotalReward'] == 1:
        sa_successes[key] += 1

# Calculate importance = success rate per (state, action)
importance = {
    key: sa_successes[key] / sa_counts[key]
    for key in sa_counts
}

# Display a few examples
for k, v in list(importance.items())[:10]:
    print(f"State {k[0]}, Action {k[1]}: Importance = {v:.4f}")


State 0, Action 2: Importance = 0.0138
State 1, Action 3: Importance = 0.0174
State 1, Action 1: Importance = 0.0144
State 2, Action 2: Importance = 0.0303
State 2, Action 0: Importance = 0.0237
State 2, Action 1: Importance = 0.0197
State 3, Action 2: Importance = 0.0083
State 4, Action 3: Importance = 0.0092
State 0, Action 0: Importance = 0.0163
State 4, Action 0: Importance = 0.0191


In [44]:
#(e)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Prepare features and target
X = [(row['State'], row['Action']) for row in data]
y = [row['TotalReward'] for row in data]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# Test accuracy (R² score)
print("Model R² score on test set:", model.score(X_test, y_test))


Model R² score on test set: 0.09265560993967492


In [45]:
#(f)

# Precompute predicted Q-values for all (state, action) pairs
q_values = {}
for state in range(env.observation_space.n):
    for action in range(env.action_space.n):
        q_values[(state, action)] = model.predict([[state, action]])[0]

# Define policy using precomputed values
def model_policy(state):
    values = [q_values[(state, a)] for a in range(env.action_space.n)]
    return int(np.argmax(values))

# Run 1000 episodes with fast policy
successes_f = 0
episodes_f = 1000

for _ in range(episodes_f):
    state, _ = env.reset()
    done = False
    while not done:
        action = model_policy(state)
        state, reward, done, truncated, info = env.step(action)
        if done and reward == 1:
            successes_f += 1

success_rate_f = successes_f / episodes_f
print(f"Success rate (1000 episodes): {success_rate_f:.4f}")


Success rate (1000 episodes): 0.2060


In [46]:
#(g)

successes_g = 0
episodes_g = 10000

for _ in range(episodes_g):
    state, _ = env.reset()
    done = False
    while not done:
        action = model_policy(state)  # uses precomputed Q-values
        state, reward, done, truncated, info = env.step(action)
        if done and reward == 1:
            successes_g += 1

success_rate_g = successes_g / episodes_g
print(f"Success rate (10,000 episodes): {success_rate_g:.4f}")


Success rate (10,000 episodes): 0.1965


In [47]:
#(h)

import random

def epsilon_greedy_policy(state, epsilon=0.1):
    if random.random() < epsilon:
        return env.action_space.sample()  # explore
    else:
        return model_policy(state)   # exploit (precomputed Q-values)

successes_eps = 0
episodes_eps = 10000
epsilon = 0.1

for _ in range(episodes_eps):
    state, _ = env.reset()
    done = False
    while not done:
        action = epsilon_greedy_policy(state, epsilon)
        state, reward, done, truncated, info = env.step(action)
        if done and reward == 1:
            successes_eps += 1

success_rate_eps = successes_eps / episodes_eps
print(f"[ε-greedy] Success rate (10,000 episodes): {success_rate_eps:.4f}")


[ε-greedy] Success rate (10,000 episodes): 0.1566


In [48]:
# Cell (i) - Compare model-guided vs ε-greedy performance (10,000 episodes each)

print("Performance Comparison on 10,000 Episodes")
print("-----------------------------------------")
print(f"Model-guided agent (greedy, part f):     {success_rate_g:.4f}")
print(f"Improved agent (ε-greedy, part h):       {success_rate_eps:.4f}")

if success_rate_eps > success_rate_g:
    print("\n ε-greedy strategy improved performance by exploring alternative paths.")
elif success_rate_eps < success_rate_g:
    print("\n Model-guided agent performed better; exploration may have been suboptimal.")
else:
    print("\n Both strategies performed equally. Try tuning ε or using more advanced models.")


Performance Comparison on 10,000 Episodes
-----------------------------------------
Model-guided agent (greedy, part f):     0.1965
Improved agent (ε-greedy, part h):       0.1566

 Model-guided agent performed better; exploration may have been suboptimal.
