<a href="https://colab.research.google.com/github/ayushisingh-14/RLGameMaster-Snake-RL/blob/main/RLGameMaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q gymnasium torch numpy matplotlib pygame



In [None]:
# -p keeps this cell idempotent so "Restart & Run All" does not fail on existing directories.
!mkdir -p env agent model utils

In [None]:
!mkdir -p env


In [None]:
!ls

agent  env  model  sample_data	utils


In [None]:
!touch env/snake_env.py

In [None]:
!ls env

snake_env.py


In [None]:
!mkdir -p model

In [None]:
!touch model/dqn_model.py

In [None]:
!ls model

dqn_model.py


In [None]:
!pwd
!ls

/content
agent  env  model  sample_data	utils


In [None]:
!mkdir -p utils


In [None]:
!ls

agent  env  model  sample_data	utils


In [19]:
!ls

agent  env  model  sample_data	utils


In [20]:
# Stray paste of a directory listing (not a command). When executed, IPython's
# automagic parsed it as `%env model ...` and set a bogus `model` environment
# variable, so it is intentionally kept only as this comment.


In [21]:
!mkdir -p agent


In [22]:
!ls

agent  env  model  sample_data	utils


In [24]:
%%writefile agent/dqn_agent.py
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from model.dqn_model import DQN
from utils.replay_buffer import ReplayBuffer


class DQNAgent:
    """Epsilon-greedy DQN agent with a policy network, a target network and a replay buffer.

    The policy network is trained with an MSE loss against one-step TD targets
    computed from the (frozen) target network. Exploration follows an
    epsilon-greedy schedule whose epsilon is multiplied by ``epsilon_decay``
    after every optimisation step until it reaches ``epsilon_min``.

    Args:
        state_shape: Accepted for API compatibility; it is currently not
            forwarded anywhere (``DQN()`` is constructed without arguments).
        num_actions: Number of discrete actions; only used for random
            exploration here (it is not forwarded to ``DQN()`` either).
        lr: Learning rate of the Adam optimiser.
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability.
        epsilon_min: Lower bound below which epsilon is no longer decayed.
        epsilon_decay: Multiplicative decay applied to epsilon per learn step.
        buffer_size: Capacity passed to ``ReplayBuffer``.
        batch_size: Number of transitions sampled per learn step.
        device: Torch device (string or ``torch.device``) for networks and batches.
    """

    def __init__(
        self,
        state_shape=(1, 10, 10),
        num_actions=4,
        lr=0.001,
        gamma=0.99,
        epsilon=1.0,
        epsilon_min=0.01,
        epsilon_decay=0.995,
        buffer_size=10000,
        batch_size=64,
        device="cpu"
    ):
        self.num_actions = num_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size
        self.device = device

        # NOTE(review): DQN() takes no arguments here, so the network's input
        # and output sizes must already agree with state_shape / num_actions
        # inside model/dqn_model.py -- confirm.
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

        self.replay_buffer = ReplayBuffer(buffer_size)

    def select_action(self, state):
        """Return an action index for ``state`` using the epsilon-greedy rule.

        With probability ``self.epsilon`` a uniformly random action in
        ``[0, num_actions - 1]`` is returned; otherwise the index of the
        largest Q-value predicted by the policy network.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.num_actions - 1)

        # Add a leading batch dimension of 1 before the forward pass.
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.policy_net(state)
        return torch.argmax(q_values).item()

    def store_transition(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition to the replay buffer."""
        self.replay_buffer.push(state, action, reward, next_state, done)

    def _stack_states(self, states):
        """Stack a sequence of per-transition states into one float32 batch on ``self.device``."""
        # Going through a single NumPy array avoids torch's very slow
        # element-by-element path for sequences of ndarrays.
        batch = np.asarray(states, dtype=np.float32)
        return torch.as_tensor(batch).to(self.device)

    def learn(self):
        """Run one optimisation step on a sampled mini-batch.

        Does nothing until the replay buffer holds at least ``batch_size``
        transitions. After the optimiser step, epsilon is decayed once (while
        it is still above ``epsilon_min``).
        """
        if len(self.replay_buffer) < self.batch_size:
            return

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        states = self._stack_states(states)
        next_states = self._stack_states(next_states)
        # gather() needs int64 indices shaped (batch, 1).
        actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(1).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        dones = torch.tensor(dones, dtype=torch.float32).to(self.device)

        # squeeze(1) (not a bare squeeze()) keeps the batch dimension even when
        # batch_size == 1, so prediction and target always share shape (batch,).
        current_q = self.policy_net(states).gather(1, actions).squeeze(1)

        # The TD target is a constant w.r.t. the policy parameters.
        with torch.no_grad():
            next_q = self.target_net(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for terminal transitions.
            target_q = rewards + (1 - dones) * self.gamma * next_q

        loss = self.loss_fn(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target_network(self):
        """Copy the policy network's current weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

Writing agent/dqn_agent.py


In [25]:
%%writefile utils/replay_buffer.py
import random
from collections import deque

class ReplayBuffer:
    """Bounded FIFO store of ``(state, action, reward, next_state, done)`` transitions.

    Backed by a ``deque`` with ``maxlen=capacity``: once full, pushing a new
    transition silently evicts the oldest one.
    """

    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Record a single transition at the newest end of the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns five parallel tuples: states, actions, rewards, next_states
        and dones, each of length ``batch_size``.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        columns = tuple(zip(*picked))
        states, actions, rewards, next_states, dones = columns
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.buffer)

Writing utils/replay_buffer.py


In [26]:
!ls

agent  env  model  sample_data	utils


In [28]:
%%writefile train.py
import sys
sys.path.append(".")

import torch
import matplotlib.pyplot as plt

from env.snake_env import SnakeEnv
from agent.dqn_agent import DQNAgent


def train(num_episodes=300, target_update_freq=20, save_path="model/dqn_snake.pth"):
    """Train a DQNAgent on SnakeEnv and return the total reward of every episode.

    Args:
        num_episodes: Number of episodes to play.
        target_update_freq: The target network is synchronised with the policy
            network whenever ``episode % target_update_freq == 0`` (so also
            after the very first episode).
        save_path: Where to write the trained policy network's ``state_dict``
            once training finishes. Pass ``None`` (or ``""``) to skip saving.
            Without a checkpoint the learned weights are lost when the
            process exits.

    Returns:
        list: One total-reward value per episode, in order.
    """
    import os

    env = SnakeEnv()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    agent = DQNAgent(device=device)

    rewards_per_episode = []

    for episode in range(num_episodes):
        # NOTE(review): assumes SnakeEnv follows the classic Gym API
        # (reset() -> state, step() -> 4-tuple). Gymnasium-style environments
        # return (obs, info) and a 5-tuple instead -- confirm against SnakeEnv.
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)

            agent.store_transition(state, action, reward, next_state, done)
            agent.learn()

            state = next_state
            total_reward += reward

        rewards_per_episode.append(total_reward)

        if episode % target_update_freq == 0:
            agent.update_target_network()

        print(
            f"Episode {episode + 1}/{num_episodes}, "
            f"Total Reward: {total_reward:.2f}, "
            f"Epsilon: {agent.epsilon:.3f}"
        )

    if save_path:
        # Persist the learned weights so they can be reloaded for evaluation.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        torch.save(agent.policy_net.state_dict(), save_path)
        print(f"Saved trained policy network to {save_path}")

    return rewards_per_episode


if __name__ == "__main__":
    # Train with the default settings, then chart the per-episode reward curve.
    rewards = train()

    _, ax = plt.subplots()
    ax.plot(rewards)
    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.set_title("Training Rewards")
    plt.show()

Writing train.py


In [29]:
%%writefile evaluate.py
import sys
sys.path.append(".")

import torch
import numpy as np

from env.snake_env import SnakeEnv
from agent.dqn_agent import DQNAgent


def evaluate(num_episodes=100, model_path="model/dqn_snake.pth"):
    """Play ``num_episodes`` greedy episodes on SnakeEnv and print summary statistics.

    Args:
        num_episodes: Number of evaluation episodes (must be at least 1).
        model_path: Path to a policy-network ``state_dict`` checkpoint. If the
            file exists it is loaded into the agent; otherwise a warning is
            printed and the freshly initialised (untrained) network is
            evaluated.

    Raises:
        ValueError: If ``num_episodes`` is smaller than 1.
    """
    import os

    if num_episodes < 1:
        raise ValueError("num_episodes must be at least 1")

    env = SnakeEnv()
    device = "cuda" if torch.cuda.is_available() else "cpu"

    agent = DQNAgent(device=device)
    agent.epsilon = 0.0  # No exploration (pure exploitation)

    if model_path and os.path.isfile(model_path):
        # map_location lets a checkpoint saved on GPU be evaluated on CPU.
        state_dict = torch.load(model_path, map_location=device)
        agent.policy_net.load_state_dict(state_dict)
        print(f"Loaded policy weights from {model_path}")
    else:
        print(
            f"Warning: no checkpoint found at {model_path!r}; "
            "evaluating an untrained network."
        )
    agent.policy_net.eval()

    wins = 0
    scores = []

    for episode in range(num_episodes):
        # NOTE(review): assumes the classic Gym API (reset() -> state,
        # step() -> 4-tuple) -- confirm against SnakeEnv.
        state = env.reset()
        done = False
        score = 0

        while not done:
            action = agent.select_action(state)
            state, reward, done, _ = env.step(action)

            # NOTE(review): presumes the environment pays exactly 10 for
            # eating food -- verify against SnakeEnv's reward scheme.
            if reward == 10:  # Ate food
                score += 1

        scores.append(score)
        # An episode counts as a "win" when at least one food item was eaten.
        if score > 0:
            wins += 1

        print(f"Episode {episode+1}: Score = {score}")

    win_rate = (wins / num_episodes) * 100
    avg_score = np.mean(scores)

    print("\nEvaluation Results")
    print("------------------")
    print(f"Win Rate: {win_rate:.2f}%")
    print(f"Average Score: {avg_score:.2f}")


# Script entry point: run the evaluation with its default arguments when this
# file is executed directly (not when it is imported).
if __name__ == "__main__":
    evaluate()

Writing evaluate.py


In [44]:
!zip -r RLGameMaster.zip agent env model utils train.py evaluate.py

  adding: agent/ (stored 0%)
  adding: agent/dqn_agent.py (deflated 70%)
  adding: env/ (stored 0%)
  adding: env/snake_env.py (stored 0%)
  adding: model/ (stored 0%)
  adding: model/dqn_model.py (stored 0%)
  adding: utils/ (stored 0%)
  adding: utils/replay_buffer.py (deflated 58%)
  adding: train.py (deflated 58%)
  adding: evaluate.py (deflated 53%)
