Task 1: Tabular Q-Learning Update


In [1]:
import numpy as  np
import random
import matplotlib.pyplot as plt

def init_q_table(n_states, n_actions):
    """Create an all-zero Q-table.

    Args:
        n_states: Number of rows, one per state.
        n_actions: Number of columns, one per action.

    Returns:
        A NumPy array of shape ``(n_states, n_actions)`` filled with zeros.
    """
    return np.zeros(shape=(n_states, n_actions))

#print(init_q_table(2,2))

ModuleNotFoundError: No module named 'numpy'

In [2]:
def q_update(Q, s, a, r, s_next, alpha, gamma):
    """Apply one tabular Q-learning update in place.

    Implements ``Q[s, a] += alpha * (r + gamma * max_a' Q[s_next, a'] - Q[s, a])``.

    Args:
        Q: 2-D array indexed as ``Q[state, action]``; modified in place.
        s: Index of the state the action was taken in.
        a: Index of the action taken.
        r: Reward received for the transition.
        s_next: Index of the resulting state.
        alpha: Learning rate.
        gamma: Discount factor.

    Returns:
        The same ``Q`` object that was passed in.
    """
    old_value = Q[s, a]
    # Greedy value of the successor state (the bootstrap term).
    best_next_value = Q[s_next].max()
    td_target = r + gamma * best_next_value
    Q[s, a] = old_value + alpha * (td_target - old_value)
    return Q

Task 2: ε-Greedy Policy on a Custom GridWorld


In [3]:
class gridWorld:
    """Deterministic square grid with a single rewarding terminal cell.

    States are ``(i, j)`` tuples with both coordinates in ``[0, size - 1]``.
    The agent starts at ``(0, 0)`` and the episode ends when it reaches the
    opposite corner ``(size - 1, size - 1)``.

    Actions are integer keys into ``self.actions``; each maps to an
    ``(di, dj)`` offset that is added to the current state:
    ``0 -> (0, +1)``, ``1 -> (0, -1)``, ``2 -> (-1, 0)``, ``3 -> (+1, 0)``.
    Moves that would leave the grid are clamped to the border, so the agent
    stays where it is along that axis.
    """

    def __init__(self, size=4):
        """Create a ``size`` x ``size`` grid with the agent on the start cell."""
        self.size = size
        self.start_state = (0, 0)
        # Derived from ``size`` so non-default grids still have a reachable
        # goal; for the default size of 4 this is (3, 3).
        self.terminal_state = (size - 1, size - 1)
        self.state = self.start_state

        self.actions = {0: (0, 1), 1: (0, -1), 2: (-1, 0), 3: (1, 0)}

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Args:
            action: Key into ``self.actions``.

        Returns:
            A tuple ``(state, reward, done)`` where ``state`` is the new
            ``(i, j)`` position, ``reward`` is ``1.0`` on reaching the
            terminal cell and ``0.0`` otherwise, and ``done`` is ``True``
            only on reaching the terminal cell.

        Raises:
            KeyError: If ``action`` is not a key of ``self.actions``.
        """
        d_i, d_j = self.actions[action]
        upper = self.size - 1
        # Clamp the *proposed* position (not the current one) to the grid,
        # otherwise the move is discarded and the agent never leaves its cell.
        self.state = (
            max(0, min(upper, self.state[0] + d_i)),
            max(0, min(upper, self.state[1] + d_j)),
        )

        done = self.state == self.terminal_state
        return self.state, (1.0 if done else 0.0), done

    def get_state_nos(self):
        """Return the current state flattened to ``i * size + j``."""
        return self.state[0] * self.size + self.state[1]

    def get_states(self):
        """Return every ``(i, j)`` cell of the grid, ordered by ``i`` then ``j``."""
        return [(i, j) for i in range(self.size) for j in range(self.size)]

In [4]:
def select_action(Q, state, epsilon):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Args:
        Q: 2-D array indexed as ``Q[state, action]``.
        state: Row index into ``Q``.
        epsilon: Probability of picking a uniformly random action instead of
            the greedy one.

    Returns:
        The index of the selected action. Ties in the greedy branch resolve
        to the lowest index, as ``np.argmax`` does.
    """
    if random.random() < epsilon:
        # Size the random draw from the Q-table row rather than assuming
        # four actions, so the policy works for any action count.
        action = random.randint(0, len(Q[state]) - 1)
    else:
        action = np.argmax(Q[state])

    return action

In [5]:

# traning in the gridWorld  

env = gridWorld()

episodes = 500
alpha = 0.1
gamma = 0.95

n_state = len(env.get_states())
#print(len(n_state)) 
n_actions = len(env.actions)
#print(len(n_actions))

epsilons = [0.1, 0.2]
result = {}

for epsilon in epsilons:
    Q = init_q_table(n_state, n_actions)
    total_rewards = []
    avg_rewards = []
    for episode in range(episodes):
        #print(episode)
        state = env.reset()
        episode_reward = 0
    
        done = False
        while not done:
            state_idx = env.get_state_nos()
            action = select_action(Q, state_idx, epsilon)
            next_state, reward, done = env.step(action)
            next_state_idx = env.get_state_nos()
            episode_reward += reward
            if not done:
                q_update(Q, state_idx, action, reward, next_state_idx, alpha, gamma)
            elif next_state == (3,3):
                print(f"target reached on episode : {episode}")    
            
        total_rewards.append(episode_reward)   
    
        if episode >= 49:
            average_reward = np.mean(total_rewards[-50:])
            avg_rewards.append(average_reward)
            
    result[epsilon] = avg_rewards        

%matplotlib inline
for epsilon, rewards in result.items():
    plt.plot(rewards, label=f"ε = {epsilon}") 

plt.title("Moving Average Reward over Episodes")
plt.xlabel("Episode")
plt.ylabel("Average Reward (last 50 episodes)")
plt.legend()       
plt.show()   

NameError: name 'init_q_table' is not defined

Task 4: Deep Q-Network with Target Copy


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DQNNetwork(nn.Module):
    """Fully connected Q-network with two hidden layers of 128 ReLU units.

    Maps an observation vector of length ``obs_dim`` to ``n_actions`` outputs,
    one per action.
    """

    def __init__(self, obs_dim, n_actions):
        """Build the layers.

        Args:
            obs_dim: Size of the input feature vector.
            n_actions: Number of output units.
        """
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(obs_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.output = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Return the network output for ``x`` (no activation on the last layer)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.output(hidden)

# Defining policy and Target classes
class DQNPolicy(DQNNetwork):
    """Q-network type for the policy role; adds nothing to ``DQNNetwork``."""

class DQNTarget(DQNNetwork):
    """Q-network type for the target role; adds nothing to ``DQNNetwork``."""

In [None]:
def update_target(policy_net, target_net):
    """Copy every parameter and buffer of ``policy_net`` into ``target_net``.

    Args:
        policy_net: Module whose ``state_dict`` is the source.
        target_net: Module that receives the weights; modified in place.
    """
    policy_weights = policy_net.state_dict()
    target_net.load_state_dict(policy_weights)

Task 5: Full DQN Training Loop on CartPole-v1

In [None]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import matplotlib.pyplot as plt

# Environment setup
env = gym.make("CartPole-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Initialising networks.
# The notebook defines DQNNetwork / DQNPolicy / DQNTarget; there is no class
# named `DQN`, so instantiate the policy and target classes. They share the
# architecture and parameter names, so the state dict copies across directly.
policy_net = DQNPolicy(state_dim, action_dim)
target_net = DQNTarget(state_dim, action_dim)
target_net.load_state_dict(policy_net.state_dict())
# The target network is never trained directly, so keep it in eval mode.
target_net.eval()

# Optimizer and loss (only the policy network's parameters are optimised)
optimizer = optim.Adam(policy_net.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# Hyperparameters
num_episodes = 500
gamma = 0.99              # discount factor
epsilon = 1.0             # initial exploration rate
epsilon_min = 0.01        # floor for the exploration rate
epsilon_decay = 1 / 500   # amount subtracted from epsilon at each decay
target_update_freq = 10   # episodes between target-network syncs

# Target network update function
def update_target(policy_net, target_net):
    """Synchronise ``target_net`` with the current weights of ``policy_net``.

    ``target_net`` is modified in place; nothing is returned.
    """
    target_net.load_state_dict(state_dict=policy_net.state_dict())

# ε-greedy action selection
def select_action(state, epsilon):
    """Pick an action epsilon-greedily using the global ``policy_net``.

    Args:
        state: A single observation accepted by ``torch.FloatTensor``.
        epsilon: Probability of returning a uniformly random action in
            ``[0, action_dim - 1]`` instead of the greedy one.

    Returns:
        The chosen action as a Python ``int``.
    """
    explore = random.random() < epsilon
    if explore:
        return random.randint(0, action_dim - 1)

    # Greedy branch: no gradients are needed just to rank the actions.
    with torch.no_grad():
        batch = torch.FloatTensor(state).unsqueeze(0)  # add a batch dimension
        return policy_net(batch).argmax().item()

In [None]:
# Training loop: one gradient step per environment step on the single most
# recent transition, with a periodically synchronised target network.
rewards = []


def _reset_env(environment):
    """Reset ``environment`` and return only the initial observation."""
    out = environment.reset()
    # gym >= 0.26 returns (observation, info); older versions return the
    # observation on its own.
    return out[0] if isinstance(out, tuple) else out


def _step_env(environment, action):
    """Step ``environment`` and return ``(obs, reward, terminated, done)``."""
    out = environment.step(action)
    if len(out) == 5:
        # gym >= 0.26: (obs, reward, terminated, truncated, info)
        next_obs, reward, terminated, truncated, _ = out
        return next_obs, reward, terminated, terminated or truncated
    # Older gym: (obs, reward, done, info); no separate truncation flag.
    next_obs, reward, done, _ = out
    return next_obs, reward, done, done


for episode in range(num_episodes):
    state = _reset_env(env)
    done = False
    total_reward = 0

    while not done:
        action = select_action(state, epsilon)
        next_state, reward, terminated, done = _step_env(env, action)
        total_reward += reward

        # The bootstrapped target comes from the target network and must not
        # carry gradients.
        with torch.no_grad():
            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
            max_next_q = target_net(next_state_tensor).max(1)[0].item()
            # Only a true terminal state has zero future value; an episode
            # cut off by a time limit still bootstraps.
            target = float(reward) + (0.0 if terminated else gamma * max_next_q)

        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_value = policy_net(state_tensor)[0, action]

        # Match the prediction's dtype explicitly so the loss never mixes
        # float32 and float64, whatever numeric type the reward has.
        target_tensor = torch.tensor(target, dtype=q_value.dtype)
        loss = loss_fn(q_value, target_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        state = next_state
        # NOTE(review): epsilon decays once per environment step, so it hits
        # epsilon_min after roughly 1 / epsilon_decay steps rather than
        # episodes - confirm this schedule is intended.
        epsilon = max(epsilon_min, epsilon - epsilon_decay)

    rewards.append(total_reward)

    if (episode + 1) % target_update_freq == 0:
        update_target(policy_net, target_net)

    if (episode + 1) % 10 == 0:
        avg = np.mean(rewards[-10:])
        print(f"Episode {episode+1}, Avg Reward: {avg:.2f}, Epsilon: {epsilon:.3f}")

I will be completely honest in explaining my solution to the fifth task.
The Week 1 resources implemented DQN using TensorFlow and Keras and did not cover implementing DQN in PyTorch, so while doing this task I took help from YouTube videos and ChatGPT. Specifically, I asked ChatGPT for a basic code structure for implementing DQN in PyTorch, along with a full explanation of that code, and I wrote my code with that help.
For all the remaining tasks, I have submitted code that I wrote myself.