In [4]:
# Install the third-party packages this notebook uses (gymnasium, torch, matplotlib); -q keeps pip's output quiet.
# NOTE(review): versions are unpinned, so a fresh install may resolve different releases — consider pinning.
%pip install -q gymnasium torch matplotlib

[notice] A new release of pip is available: 23.3.1 -> 25.1.1
[notice] To update, run: python -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.


# Check if your system has a GPU

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

# Select the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# When running on the GPU, also report the name and total memory of device 0.
if device.type == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

Using device: cuda
GPU Name: NVIDIA RTX 2000 Ada Generation
GPU Memory: 16.8 GB


# Load the Environment

In [6]:
import gymnasium as gym
# Create the CartPole-v1 environment; the inspection cells that follow operate on this `env`.
env = gym.make("CartPole-v1")


In [7]:
# Sanity check: the action space is expected to contain exactly two actions.
# As the last expression in the cell, the comparison result is displayed as the output.
env.action_space.n == 2

True

In [9]:
# Start a new episode. reset() returns a pair; only the first element (the initial state) is kept.
state, _ = env.reset()
# Print the state's shape to see how many features describe it.
print("State shape:", state.shape)


State shape: (4,)


In [10]:
# Reset the environment to start a fresh episode; reset() returns the initial
# observation together with an auxiliary info value.
observation, info = env.reset()

# Show the declared observation/action spaces next to one concrete observation and its shape.
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)
print("Initial observation:", observation)
print("Observation shape:", observation.shape)

Observation space: Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Action space: Discrete(2)
Initial observation: [ 0.04235871 -0.02958577 -0.0320276  -0.02347192]
Observation shape: (4,)


# Taking random actions in the environment

In [12]:
import gymnasium as gym
import numpy as np

# Seed shared by the environment reset and the action sampler so that this demo
# produces the same trajectory on every run.
DEMO_SEED = 42

env = gym.make("CartPole-v1")
try:
    # Seeding reset() fixes the initial observation; seeding the action space
    # fixes the sequence returned by action_space.sample().
    observation, info = env.reset(seed=DEMO_SEED)
    env.action_space.seed(DEMO_SEED)

    print("=== Taking 5 random steps ===")
    for step in range(5):
        # Take a random action
        action = env.action_space.sample()  # randomly choose 0 or 1

        # Execute the action
        observation, reward, terminated, truncated, info = env.step(action)

        print(f"Step {step+1}:")
        print(f"  Action taken: {action} ({'Left' if action == 0 else 'Right'})")
        print(f"  New observation: {observation}")
        print(f"  Reward: {reward}")
        print(f"  Episode ended: {terminated or truncated}")
        print()

        # If the episode ends, reset the environment and stop the demo early
        if terminated or truncated:
            print("Episode ended! Resetting...")
            observation, info = env.reset()
            break
finally:
    # Always release the environment's resources, even if a step raises.
    env.close()

=== Taking 5 random steps ===
Step 1:
  Action taken: 0 (Left)
  New observation: [-0.02022828 -0.15586087 -0.02850062  0.2861497 ]
  Reward: 1.0
  Episode ended: False

Step 2:
  Action taken: 1 (Right)
  New observation: [-0.02334549  0.0396557  -0.02277763 -0.01538409]
  Reward: 1.0
  Episode ended: False

Step 3:
  Action taken: 0 (Left)
  New observation: [-0.02255238 -0.15513231 -0.02308531  0.2700261 ]
  Reward: 1.0
  Episode ended: False

Step 4:
  Action taken: 1 (Right)
  New observation: [-0.02565503  0.04031134 -0.01768479 -0.02984775]
  Reward: 1.0
  Episode ended: False

Step 5:
  Action taken: 1 (Right)
  New observation: [-0.0248488   0.23568238 -0.01828174 -0.32805753]
  Reward: 1.0
  Episode ended: False



# ------------------------------------------------------------
# ✅ Deep Q-Learning Summary: What are we learning?
#
# - We are learning the Q-function: Q(s, a)
#     → Tells us: "If I take action a in state s, how much total (discounted) future reward can I expect?"
#
# - The policy is not learned directly.
#     → At runtime, we pick the action with the highest Q-value:
#         policy(s) = argmax_a Q(s, a)
#
# - Goal: Learn accurate Q-values so we can act optimally by just picking the best action.
# ------------------------------------------------------------


# ------------------------------------------------------------
# 📘 Bellman Equation and Q-Value Learning (Value-Based RL)
#
# Richard Bellman introduced dynamic programming, which helps us
# solve problems recursively by using solutions to subproblems.
#
# In Q-learning, we estimate the Q-value (expected return from
# taking action 'a' in state 's') using the Bellman equation:
#
#     Q(s, a) ← r + γ * max_a' Q(s', a')
#
# Where:
# - r is the immediate reward after taking action a in state s
# - γ (gamma) is the discount factor for future rewards
# - s' is the new state after taking action a
# - max_a' Q(s', a') is the best future return we can expect from s'
#   (this term is taken as 0 when s' is a terminal state, so the target is just r)
#
# The goal: Learn good Q(s, a) values so that we can act using:
#     policy(s) = argmax_a Q(s, a)
#
# This is the foundation of Value-Based Reinforcement Learning.
# ------------------------------------------------------------

# ------------------------------------------------------------
# ❗ Why Tabular Q-Learning Doesn't Work for CartPole
#
# - The state in CartPole is a vector of 4 continuous values:
#     [cart position, cart velocity, pole angle, pole angular velocity]
#
# - There are infinitely many possible states (real numbers),
#   so we cannot build a Q-table with one row per state.
#
# ✅ Solution: Use a Deep Q-Network (DQN), a neural network that approximates Q(s, a)
# - Input: the continuous state vector (4 floats)
# - Output: Q-values for each possible action (e.g., [Q_left, Q_right])
#
# This lets us generalize across similar states using function approximation.
# ------------------------------------------------------------
