<a href="https://colab.research.google.com/github/ariquintal/machinelearning/blob/main/Reinforcement_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gym
!pip install tensorflow
!pip install --upgrade tensorflow



In [None]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.keras import layers

In [None]:
# Define the environment
class BoxSortingEnv(gym.Env):
    """Toy box-sorting environment.

    Each step presents a box labelled 'a'..'e'; the agent picks one of the
    containers 'A'..'E' (actions 0..4) and earns a reward of 1 when the
    container letter matches the box label, otherwise 0.

    Observations are returned as single-character label strings (not integer
    indices), which is what the rest of this notebook expects.

    Args:
        max_steps: Number of steps after which an episode is reported as done.
            Without a limit the environment would never signal termination and
            any ``while True ... if done: break`` loop would run forever.

    Raises:
        ValueError: If ``max_steps`` is smaller than 1.
    """

    LABELS = ('a', 'b', 'c', 'd', 'e')
    CONTAINERS = ('A', 'B', 'C', 'D', 'E')

    def __init__(self, max_steps=100):
        super().__init__()
        if max_steps < 1:
            raise ValueError("max_steps must be at least 1")
        self.observation_space = gym.spaces.Discrete(5)  # 5 labels (a, b, c, d, e)
        self.action_space = gym.spaces.Discrete(5)       # 5 containers (A, B, C, D, E)
        self.max_steps = max_steps
        self.steps_taken = 0

    def reset(self):
        """Start a new episode and return the label of the first box."""
        self.steps_taken = 0
        self.current_box = np.random.choice(self.LABELS)
        self.current_container = np.random.choice(self.CONTAINERS)
        return self.current_box

    def step(self, action):
        """Place the current box in container ``action`` and draw the next box.

        Args:
            action: Index (0..4) of the chosen container.

        Returns:
            Tuple ``(next_box_label, reward, done, info)`` where ``done``
            becomes True once ``max_steps`` steps have been taken since the
            last ``reset()``; ``info`` is always an empty dict.
        """
        # Update the container based on the chosen action
        self.current_container = self.CONTAINERS[action]

        # Reward the agent for correct classification
        reward = 1 if self.current_box == self.current_container.lower() else 0

        # Generate a new box
        self.current_box = np.random.choice(self.LABELS)

        # End the episode once the step budget is used up
        self.steps_taken += 1
        done = self.steps_taken >= self.max_steps

        return self.current_box, reward, done, {}

In [None]:
env = BoxSortingEnv()


# Q-network: maps a one-hot encoded box label (length 5) to one Q-value per
# container (5 outputs), through a single hidden layer of 32 ReLU units.
model = tf.keras.Sequential()
model.add(layers.Dense(32, activation='relu', input_shape=(5,)))
model.add(layers.Dense(5, activation='linear'))


# Define the Q-learning agent
class QLearningAgent:
    """Q-learning agent that fits a Keras Q-network one transition at a time.

    Args:
        model: Network mapping a one-hot state (depth 5) to per-action Q-values.
        learning_rate: Step size for the Adam optimizer.
        gamma: Discount factor applied to the best next-state Q-value.
    """

    def __init__(self, model, learning_rate=0.001, gamma=0.99):
        self.model = model
        self.gamma = gamma
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

    def train(self, state, action, reward, next_state):
        """Apply one gradient step on the squared TD error of a transition.

        Args:
            state: Current box label, a single character 'a'..'e'.
            action: Index of the container that was chosen.
            reward: Reward received for that choice.
            next_state: Label of the box observed after the step.
        """
        # Labels -> integer indices ('a' -> 0, 'b' -> 1, ...)
        offset = ord('a')
        state_idx = ord(state) - offset
        next_idx = ord(next_state) - offset

        # Integer indices -> one-hot rows of shape (1, 5)
        current_vec = tf.one_hot([state_idx], depth=5)
        following_vec = tf.one_hot([next_idx], depth=5)

        # TD target; computed outside the tape, so no gradient flows through it
        best_next_q = tf.reduce_max(self.model(following_vec))
        td_target = reward + self.gamma * best_next_q

        with tf.GradientTape() as tape:
            # Q-value the network currently assigns to the action taken
            q_row = self.model(current_vec, training=True)
            chosen_q = q_row[0, action]

            # Squared TD error
            td_error = td_target - chosen_q
            loss = tf.reduce_mean(tf.square(td_error))

        # Gradient step on the network weights
        weights = self.model.trainable_variables
        grads = tape.gradient(loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))


In [None]:
# Create the Q-learning agent instance
agent = QLearningAgent(model)

# Training loop
num_episodes = 10
# Upper bound on steps per episode, so training terminates even if the
# environment never reports done=True.
max_steps_per_episode = 100

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    # Exploration-exploitation trade-off; depends only on the episode number,
    # so it is computed once per episode rather than once per step.
    epsilon = max(0.1, 1 - episode / 800)

    for step in range(max_steps_per_episode):
        # Choose action based on epsilon-greedy strategy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            # States are label strings; tf.one_hot needs integer indices.
            state_index = ord(state) - ord('a')
            q_values = agent.model.predict(tf.one_hot([state_index], depth=5), verbose=0)
            action = np.argmax(q_values)  # Exploitation

        # Take action and observe the next state and reward
        next_state, reward, done, _ = env.step(action)

        # Train the Q-learning agent
        agent.train(state, action, reward, next_state)

        total_reward += reward
        state = next_state

        if done:
            break

    if episode % 50 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}")


The training code below needed more time to execute than was available, so it was not run to completion.

Create the Q-learning agent instance
agent = QLearningAgent(model)

# Training loop
num_episodes = 10
# Upper bound on steps per episode, so training terminates even if the
# environment never reports done=True.
max_steps_per_episode = 100

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0

    # Exploration-exploitation trade-off; depends only on the episode number,
    # so it is computed once per episode rather than once per step.
    epsilon = max(0.1, 1 - episode / 800)

    for step in range(max_steps_per_episode):
        # Choose action based on epsilon-greedy strategy
        if np.random.rand() < epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            # States are label strings; tf.one_hot needs integer indices.
            state_index = ord(state) - ord('a')
            q_values = agent.model.predict(tf.one_hot([state_index], depth=5), verbose=0)
            action = np.argmax(q_values)  # Exploitation

        # Take action and observe the next state and reward
        next_state, reward, done, _ = env.step(action)

        # Train the Q-learning agent
        agent.train(state, action, reward, next_state)

        total_reward += reward
        state = next_state

        if done:
            break

    if episode % 50 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward}")


Likewise, the testing part below could not be executed.

Testing loop
num_test_episodes = 10
# Upper bound on steps per test episode, so the loop terminates (and the
# printed output stays short) even if the environment never reports done=True.
max_steps_per_test_episode = 10

for _ in range(num_test_episodes):
    state = env.reset()

    for step in range(max_steps_per_test_episode):
        # Choose the best action based on learned Q-values.
        # States are label strings; tf.one_hot needs integer indices.
        state_index = ord(state) - ord('a')
        q_values = agent.model.predict(tf.one_hot([state_index], depth=5), verbose=0)
        action = np.argmax(q_values)

        # Take action and observe the next state
        next_state, _, done, _ = env.step(action)

        # Print the movement of the boxes
        print(f"Move box {state.upper()} to container {['A', 'B', 'C', 'D', 'E'][action]}")

        state = next_state

        if done:
            break

In [None]:
%%shell
# Convert this notebook to HTML with nbconvert.
jupyter nbconvert --to html /content/Reinforcement_learning.ipynb

[NbConvertApp] Converting notebook /content/Reinforcement_learning.ipynb to html
[NbConvertApp] Writing 608957 bytes to /content/Reinforcement_learning.html


