DEfining the Q-Learning Model

In [1]:
from tensorflow.keras.models import Sequential  #A Linear stack of layers
from tensorflow.keras.layers import Dense,Input #Dense means fully connected layers
from tensorflow.keras.optimizers import Adam

def build_model(action_size,state_size):
    model=Sequential([
        Input(shape=(state_size,)), #Input layer
        Dense(24,activation='relu'),
        Dense(24,activation='relu'),
        Dense(action_size,activation='linear') #Output layer; Linear, as we are predicting continuous Q-values.
    ])
    
    model.compile(optimizer=Adam(0.001),loss='mse') #Adam adjusts the learning rate based on gradients.
    return model

In [2]:
import gymnasium as gym
import numpy as np

env=gym.make('CartPole-v1')
# env.reset()

np.random.seed(42)
env.action_space.seed(42)
env.observation_space.seed(42)

state_size=env.observation_space.shape[0]
action_size=env.action_space.n

model=build_model(state_size, action_size)

Implementing THe Q-Learning AlgOrithm
1. Define the replay Function

In [8]:
import random
from collections import deque
import tensorflow as tf

epsilon=1 #Exploration rate
epsilon_min=0.01
epsilon_decay=0.995

# Replay memory
memory=deque(maxlen=2000)

def remember(state,action,reward,next_state,Done):
    memory.append((state,action,reward,next_state,Done)) #Store experience in memory

def replay(batch_size=64):
    # Tranin the model using a random sample of experience
    if len(memory) < batch_size:
        return

    minibatch = random.sample(memory, batch_size)

    states = np.vstack([x[0] for x in minibatch]) 
    actions = np.array([x[1] for x in minibatch])
    rewards = np.array([x[2] for x in minibatch])
    next_states = np.vstack([x[3] for x in minibatch]) 
    dones = np.array([x[4] for x in minibatch])

    # Predict Q-values for the next states 
    q_next = model.predict(next_states)
    # Predict Q-values for the current states 
    q_target = model.predict(states)

    # Vectorized update of target values
    for i in range(batch_size):
        target = rewards[i]
        if not dones[i]:
            target += 0.95 * np.amax(q_next[i])  # Discounted future reward
        q_target[i][actions[i]] = target  # Update the Q-value for the action taken

    # Train the model with updated Q-values
    model.fit(states, q_target, verbose=0, epochs=1)

    # Reduce exploration rate (epsilon) after each training step
    global epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

In [9]:
def act(state):
    """Choose an action based on the current state and exploration rate."""
    if np.random.rand() <= epsilon:
        return random.randrange(action_size)  # Explore: choose a random action
    act_values = model.predict(state)  # Exploit: predict action based on the state
    return np.argmax(act_values[0])  # Return the action with the highest Q-value

episodes=10
train_freq=5

for e in range(episodes):
    state, _ = env.reset()  # Unpack the tuple returned by env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(200):  # Limit to 200 time steps per episode
        action = act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        remember(state, action, reward, next_state, done)  # Store experience
        state = next_state

        if done:
            print(f"episode: {e+1}/{episodes}, score: {time}, e: {epsilon}")
            break
        
        # Train the model every 'train_frequency' steps
        if time % train_freq == 0:
            replay(batch_size=64)  

env.close()

episode: 1/10, score: 33, e: 1
episode: 2/10, score: 30, e: 1


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 2, but received input with shape (32, 4)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(32, 4), dtype=float32)
  • training=False
  • mask=None

In [10]:
import random
from collections import deque
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Hyperparameters
epsilon = 1  # Exploration rate
epsilon_min = 0.01
epsilon_decay = 0.995
state_size = 4  # Example: Environment state size
action_size = 2  # Example: Number of possible actions

# Replay memory
memory = deque(maxlen=2000)

# Build the model
model = Sequential([
    Dense(24, input_dim=state_size, activation='relu'),  # Input shape matches state_size
    Dense(24, activation='relu'),
    Dense(action_size, activation='linear')  # Output size matches action_size
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

def remember(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))  # Store experience in memory

def replay(batch_size=64):
    if len(memory) < batch_size:
        return

    # Sample a minibatch from memory
    minibatch = random.sample(memory, batch_size)

    # Extract states, actions, rewards, next_states, and dones from the minibatch
    states = np.vstack([x[0] for x in minibatch])  # Shape: (batch_size, state_size)
    actions = np.array([x[1] for x in minibatch])
    rewards = np.array([x[2] for x in minibatch])
    next_states = np.vstack([x[3] for x in minibatch])  # Shape: (batch_size, state_size)
    dones = np.array([x[4] for x in minibatch])

    # Predict Q-values for the next states in batch
    q_next = model.predict(next_states, verbose=0)
    # Predict Q-values for the current states in batch
    q_target = model.predict(states, verbose=0)

    # Vectorized update of target values
    for i in range(batch_size):
        target = rewards[i]
        if not dones[i]:
            target += 0.95 * np.amax(q_next[i])  # Discounted future reward
        q_target[i][actions[i]] = target  # Update the Q-value for the action taken

    # Train the model with updated Q-values
    model.fit(states, q_target, verbose=0, epochs=1)

    # Reduce exploration rate (epsilon) after each training step
    global epsilon
    if epsilon > epsilon_min:
        epsilon *= epsilon_decay

# Main training loop
episodes = 10
train_freq = 5

for e in range(episodes):
    state, _ = env.reset()  # Unpack the tuple returned by env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(200):  # Limit to 200 time steps per episode
        action = act(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        remember(state, action, reward, next_state, done)  # Store experience
        state = next_state

        if done:
            print(f"episode: {e+1}/{episodes}, score: {time}, e: {epsilon}")
            break

        # Train the model every 'train_frequency' steps
        if time % train_freq == 0:
            replay(batch_size=64)  # Call replay with larger batch size for efficiency

env.close()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


episode: 1/10, score: 30, e: 1
episode: 2/10, score: 29, e: 1
episode: 3/10, score: 18, e: 0.985074875
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
episode: 4/10, score: 42, e: 0.9416228069143757
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
episode: 5/10, score: 17, e: 0.9229311239742362
episode: 6/10, score: 17, e: 0.9046104802746175
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
episode: 7/10, score: 12, e: 0.8911090557802088
episode: 8/10, score: 18, e: 0.8734200960253871
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
episode: 

In [12]:
for e in range(10):  

    state, _ = env.reset()  # Unpack the state from the tuple 
    state = np.reshape(state, [1, state_size])  # Reshape the state correctly 
    for time in range(500):  
        env.render()  
        action = np.argmax(model.predict(state)[0])  
        next_state, reward, terminated, truncated, _ = env.step(action)  # Unpack the five return values 
        done = terminated or truncated  # Check if the episode is done 
        next_state = np.reshape(next_state, [1, state_size])  
        state = next_state  
        if done:  
            print(f"episode: {e+1}/10, score: {time}")  
            break  

env.close() 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step


  gym.logger.warn(


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
episode: 1/10, score: 7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[