# Project RL4Demozelle


## Create Communication to MQTT-Broker via Paho


In [1]:
import paho.mqtt.client as mqtt
import ssl


In [2]:
class MqttClient():
    def __init__(self, username, password, server_address, port, topics):
        self.username = username
        self.password = password
        self.server_address = server_address
        self.port = port
        self.topics = topics
        self.last_received_values = {}  # Dictionary to store values of every topic
        self.message_received = False

        # initialize mqtt client
        self.client = mqtt.Client()
        self.client.on_connect = self.on_connect
        self.client.on_message = self.on_message

        # enable tls for secure connection
        self.client.tls_set(tls_version=ssl.PROTOCOL_TLS)

        # set username and password
        self.client.username_pw_set(self.username, self.password)

        # connect to the broker
        self.client.connect(self.server_address, self.port, keepalive=60)

        # subscribe to the topics
        for topic in self.topics:
            self.client.subscribe(topic)
            # Initialisierung für jedes Topic
            self.last_received_values[topic] = None

        # start loop to wait for messages
        self.client.loop_start()

    # The callback for when the client receives a CONNACK response from the server.
    def on_connect(self, client, userdata, flags, rc):
        print("Connected with result code " + str(rc))

    # The callback for when a PUBLISH message is received from the server.
    def on_message(self, client, userdata, msg):
        self.message_received = True
        received_value = float(msg.payload.decode('utf-8').replace(',', '.'))
        topic = msg.topic
        # print(f'Received Value for Topic {topic}: {received_value}')
        self.last_received_values[topic] = received_value
        # print(self.last_received_values[topic])

        return self.last_received_values[topic]

    def publish(self, topic, value, qos=0, retain=False):
        # publish value for a specific topic
        self.client.publish(topic, payload=float(value),
                            qos=qos, retain=retain)
        # print(f"Published values on topic {topic}: {value}")


In [3]:
USERNAME = 'UiN7dRqkM4ZA'
PASSWORD = 'YGmSKvebsz7V..V'
SERVER_ADDRESS = '3c4fdc3e60c54d06b6252a909b39100e.s2.eu.hivemq.cloud'
SERVER_PORT = 8883


## Create RL Environment via OpenAi Gym library


In [4]:
import gymnasium as gym
import numpy as np
from time import sleep
from scipy.interpolate import CubicSpline
import time


In [5]:
class DemocellEnv(gym.Env):
    def __init__(self):
        # Definition of inital speed values
        self._conveyor_speed_1 = 0
        self._conveyor_speed_2 = 0

        # Definition of the speed and centricity threshhold
        self._conveyor_speed_1_interval = (300, 1100)
        self._conveyor_speed_2_interval = (300, 1100)
        self._centrcity_threshold = 0.25

        # List of rewards
        self.rewards = []

        # List of time per steps
        self.list_of_time_per_step = []

        # MQTT-Server credentials
        self.username = USERNAME
        self.password = PASSWORD
        self.server_address = SERVER_ADDRESS
        self.server_port = SERVER_PORT

        # MQTT Topics for subscribing
        self.subscriber_topics = ['Democell/Quality/Centricity']

        # Initialize of the mqtt client
        self.mqtt_client = MqttClient(username=self.username, password=self.password,
                                      server_address=self.server_address, port=self.server_port, topics=self.subscriber_topics)

        # Definition of the action space
        # Action space with 2 discrete speed values for each conveyor (increase or decrease speed); in total 4 dicrete values
        self.action_space = gym.spaces.Discrete(4)

        # Definition of the observation space
        self.observation_space = gym.spaces.Dict({
            # Centricity value in an interval between 0.0 and 1.0, with 0.0 is not centered at all and 1.0 is perfectly centered
            'centricity': gym.spaces.Box(low=0, high=1, shape=(1,), dtype=float),
            # Speed values for conveyor speed 1 between 150 and 1100
            'conveyor_speed_1': gym.spaces.Box(low=self._conveyor_speed_1_interval[0], high=self._conveyor_speed_1_interval[1], shape=(1,), dtype=float),
            # Speed values for conveyor speed 2 between 150 and 1100
            'conveyor_speed_2': gym.spaces.Box(low=self._conveyor_speed_2_interval[0], high=self._conveyor_speed_2_interval[1], shape=(1,), dtype=float),
        })

        '''
        The following dictionay maps the abstract actions from 'self.action_space' to the speed action it will take.
        So that 0 corresponds to 'increase' conveyor speed, whereas 1 corresponds to 'decrease' conveyor speed.
        0 & 1: Conveyor speed 1 will be changed
        2 & 3: Conveyor speed 2 will be changed
        '''
        self._action_to_speed_change = {
            0: float(-50),  # Decrease conveyor_speed_1
            1: float(50),   # Increase conveyor_speed_1
            2: float(-50),  # Decrease conveyor_speed_2
            3: float(50)    # Increase conveyor_speed_2
        }

    # Returns an array of speed values for _conveyor_speed_1, _conveyor_speed_2 and centricity
    def _get_obs(self):
        return {
            'conveyor_speed_1': np.array([self._conveyor_speed_1], dtype=float),
            'conveyor_speed_2': np.array([self._conveyor_speed_2], dtype=float),
            'centricity': np.array([self._centricity], dtype=float)
        }

    def _get_info(self):
        return {
            'conveyor_speed_1': self._conveyor_speed_1,
            'conveyor_speed_2': self._conveyor_speed_2,
            'centricity': self._centricity
        }

    def reset(self, seed=None, options=None):
        '''
        Resets the values for Conveyor_speed_1 to a random number between the max speed value and min speed value from conveyor_speed_1_interval
        '''
        # Inherit from seed.np_random to set seed
        super().reset(seed=seed)

        # Reset current_step and centricity value
        self.current_step = 0
        # self._centricity = self.mqtt_client.last_received_values['Democell/Quality/Centricity']
        self._centricity = None

        # Set speed values for conveyor 1 & 2 to random speed between minimal and maximal speed and publish the values to mqtt
        self._conveyor_speed_1 = self.np_random.uniform(
            low=self._conveyor_speed_1_interval[0], high=self._conveyor_speed_1_interval[1], size=1).astype(float)
        self._conveyor_speed_1 = self._conveyor_speed_1[0]
        self._conveyor_speed_2 = self.np_random.uniform(
            low=self._conveyor_speed_2_interval[0], high=self._conveyor_speed_2_interval[1], size=1).astype(float)
        self._conveyor_speed_2 = self._conveyor_speed_2[0]

        # Get back the observation parameters and additional infos
        observation = self._get_obs()
        info = self._get_info()

        return observation, info

    def step(self, action):
        '''
        For the step the action from the action space is mapped. For action 0 in this step the speed will be reduced by 0.01 for action 1 the speed will be increased by 0.01.
        Also after setting the speed to an increased or decreased speed, this function will wait for some seconds and read out the corresponding centricity vlaue.
        After getting the centrcity value the function will then calculate a reward based on the speed and centricty values.
        '''

        # Start the step
        start_time = time.time()
        # Map the abstract action to the change in speed
        speed_change = self._action_to_speed_change[action]

        # Execute action depending on the Discrete number of the action space: for 0 & 1 Conveyor Speed 1 will be changed, for 3 & 4 Conveyor Speed 2 will be changed
        # # Use 'np.clip' to make sure we do not leave the speed intervals
        action_list = int(action)
        if action_list in [0, 1]:  # Actions 0 and 1 correspond to conveyor_speed_1
            # self._conveyor_speed_1 = float(np.clip(self._conveyor_speed_1 + speed_change, self._conveyor_speed_1_interval[0], self._conveyor_speed_1_interval[1]))
            self._conveyor_speed_1 += speed_change
        else:  # Actions 2 and 3 correspond to conveyor_speed_2
            # self._conveyor_speed_2 = float(np.clip(self._conveyor_speed_2 + speed_change, self._conveyor_speed_2_interval[0], self._conveyor_speed_2_interval[1]))
            self._conveyor_speed_2 += speed_change

        # Convert speed values to speed values in m/s
        _conveyor_speed_1_converted = self.calc_speed_value_to_ms(
            self._conveyor_speed_1)
        print(f'Conveyor Speed 1: {self._conveyor_speed_1}')
        _conveyor_speed_2_converted = self.calc_speed_value_to_ms(
            self._conveyor_speed_2)
        print(f'Conveyor Speed 2: {self._conveyor_speed_2}')

        # Publish new speed values to the mqtt server
        self.mqtt_client.publish('Democell/Speed/Conveyor_Speed_1',
                                 value=_conveyor_speed_1_converted, qos=0, retain=False)
        self.mqtt_client.publish('Democell/Speed/Conveyor_Speed_2',
                                 value=_conveyor_speed_2_converted, qos=0, retain=False)

        # Wait for new centricity value
        self.mqtt_client.client.loop_start()
        self.mqtt_client.message_received = False
        while not self.mqtt_client.message_received:
            sleep(2)
            if self.mqtt_client.message_received == True:
                self.mqtt_client.message_received = False
                break

        self._centricity = self.mqtt_client.last_received_values['Democell/Quality/Centricity']
        print(f'Centricity: {self._centricity}')
        print('_________________________________________________________')

        reward = self.calculate_reward(
            self._centricity, self._conveyor_speed_1, self._conveyor_speed_2)

        self.rewards.append(round(reward, 3))

        # Episode terminates after 10 steps
        self.max_steps_per_episode = 2
        self.current_step += 1
        if self.current_step >= self.max_steps_per_episode:
            terminated = True
        else:
            terminated = False
        # Calculate the time of each step
        time_of_step = time.time() - start_time
        self.list_of_time_per_step.append(time_of_step)

        observation = self._get_obs()
        info = self._get_info()

        return observation, reward, terminated, False, info

    def calculate_reward(self, centricity, speed_1, speed_2):
        # Define the reward function; For centricity values above threshold give strong negative reward
        if centricity > self._centrcity_threshold:
            reward = -1
        else:
            reward = speed_1 + speed_2

        return reward

    def calc_speed_value_to_ms(self, speed_value):
        # Datapoint from Excel 'Bandgeschwindigkeiten_Demozelle', Column: eingestellte Geschwindigkeit [1]; Column: Bandgeschwindigkeit [m/s]
        geschwindigkeiten = np.array([150, 200, 250, 300, 350, 400, 450, 500,
                                     550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100])
        bandgeschwindigkeiten = np.array([0.021, 0.039, 0.058, 0.076, 0.093, 0.114, 0.131, 0.148,
                                         0.165, 0.184, 0.203, 0.218, 0.236, 0.262, 0.276, 0.293, 0.308, 0.321, 0.323, 0.328])

        # Calculate cubic function
        interpolation = CubicSpline(geschwindigkeiten, bandgeschwindigkeiten)

        # Interpolation with cubic-function for speed values to metre/sec
        result = interpolation(speed_value)

        return result


In [6]:
from stable_baselines3 import DQN
import logging
import matplotlib.pyplot as plt


In [15]:
# Instantiate custom gym RL environment
env = DemocellEnv()

# Hyperparameters fpr DQN-Agent
LR = 1e-2
BUFFER_SIZE = 0
LEARNING_STARTS = 0
BATCH_SIZE = 0
GAMMA = 0.95

# Instantiate stablebaselines3 DQN Agent; As policy 'MultiInputPolicy' is needed sind we use a dictionary as observation space
DQNAgent = DQN(policy='MultiInputPolicy', env=env, verbose=0, learning_rate=LR,
               learning_starts=LEARNING_STARTS, buffer_size=BUFFER_SIZE, batch_size=BATCH_SIZE, gamma=GAMMA)
# Learning with the DQN-Agent
DQNAgent.learn(total_timesteps=2, log_interval=10)
DQNAgent.set_random_seed(42)
obs, info = env.reset(seed=42)

# Used for logging information about each episode; it will debu
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s [%(levelname)s] %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)
# Create a FileHandler to save log info in a text file
file_handler = logging.FileHandler('result.txt', mode='a')
file_handler.setLevel(logging.DEBUG)  # Set the level as desired

logger.addHandler(file_handler)
eps_counter = 0
step_counter = 0
counter = None
# Loop for teaching the model; here it will explore and exploit the gym custom environment
while True:
    print("start")
    action, _states = DQNAgent.predict(obs, deterministic=True)
    action = int(action)
    obs, reward, terminated, test, info = env.step(action)

    # Log information to the notebook; it will also count the episodes and the used steps of each episode
    eps_counter += 1
    step_counter += 1
    plt.clf()
    # If Terminated, exit the loop and end the training
    if terminated:
       # Extract episode rewards from the Monitor
        episode_rewards = env.rewards
        duration_for_steps = env.list_of_time_per_step

        # Create a figure with two subplots
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(
            10, 8), sharex=True, facecolor='white')

        # Plot Total Rewards on the first subplot
        color = 'tab:blue'
        ax1.set_ylabel('Total Reward', color=color)
        ax1.plot(episode_rewards, color=color, label='Total Reward')
        ax1.tick_params(axis='y', labelcolor=color)

        # Plot Duration for Each Step on the second subplot
        color = 'tab:orange'
        ax2.set_xlabel('Steps')
        ax2.set_ylabel('Duration', color=color)
        ax2.plot(duration_for_steps, color=color, linestyle='--',
                 label='Time steps for each step')
        ax2.tick_params(axis='y', labelcolor=color)

        # Title and legends
        fig.suptitle('Total Reward and Duration for Each Step')
        # Adjust the layout to avoid title overlap
        fig.tight_layout(rect=[0, 0, 1, 0.96])
        fig.legend(loc='upper right')

        # Save the plot
        plt.savefig(
            f'Reward_and_Duration for episode {eps_counter}.png', facecolor='white')
        plt.clf()

        # Example usage of the logger
        if (counter is None): 
            logger.warning(
                f'Learning Rate : {LR}, BUFFER_SIZE : {BUFFER_SIZE}, Learning_starts:{LEARNING_STARTS}, batch size: {BATCH_SIZE}')
            logger.info(
                f"Episode: {eps_counter}, Step: {step_counter}, Reward: {reward}, Speed 1: {info['conveyor_speed_1']}, Speed 2: {info['conveyor_speed_2']}, Centricity: {info['centricity']} \n" + 60*"--")
            counter = 1

        obs, info = env.reset()
        print("Episode terminated. Exiting the program.")
        break
# Save model to the same directory
DQNAgent.save('DQN_Democell')


Conveyor Speed 1: 537.646292788247
Conveyor Speed 2: 753.0315260910389
Connected with result code 0
Centricity: 0.05340784
_________________________________________________________
Conveyor Speed 1: 587.646292788247
Conveyor Speed 2: 753.0315260910389
Centricity: 0.04110626
_________________________________________________________
Conveyor Speed 1: 273.8546851974731
Conveyor Speed 2: 782.9846958676707
Centricity: 0.05299375
_________________________________________________________
Conveyor Speed 1: 273.8546851974731
Conveyor Speed 2: 832.9846958676707
Centricity: 0.03267279
_________________________________________________________
start
Conveyor Speed 1: 869.1648388447707
Conveyor Speed 2: 651.1027518016418
Centricity: 0.06484357
_________________________________________________________
start
Conveyor Speed 1: 919.1648388447707
Conveyor Speed 2: 651.1027518016418


2024-01-31 16:53:50 [DEBUG] top of Axes not in the figure, so title not moved


Centricity: 0.04603009
_________________________________________________________


2024-01-31 16:53:50 [DEBUG] top of Axes not in the figure, so title not moved
2024-01-31 16:53:50 [DEBUG] top of Axes not in the figure, so title not moved
2024-01-31 16:53:50 [DEBUG] top of Axes not in the figure, so title not moved
2024-01-31 16:53:51 [INFO] Episode: 2, Step: 2, Reward: 1570.2675906464124, Speed 1: 919.1648388447707, Speed 2: 651.1027518016418, Centricity: 0.04603009 
------------------------------------------------------------------------------------------------------------------------


Episode terminated. Exiting the program.


<Figure size 432x288 with 0 Axes>

<Figure size 720x576 with 0 Axes>

## Create RL Agent via PyTorch


### Configuration of matplotlib


In [None]:
import matplotlib
import matplotlib.pyplot as plt


In [None]:
# Set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()


<contextlib.ExitStack at 0x260ef5c3dd0>

### Replay memory


In [None]:
from collections import namedtuple, deque
import random


In [None]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        # Save a transition
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


### DQN algorithm


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [None]:
class DQN(nn.Module):

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 128)
        self.layer3 = nn.Linear(128, n_actions)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...])s.
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)


## Training


### Hyperparameters and utilities


In [None]:
# BATCH_SIZE is the number of transitions sampled from the replay buffer
# GAMMA is the discount factor as mentioned in the previous section
# EPS_START is the starting value of epsilon
# EPS_END is the final value of epsilon
# EPS_DECAY controls the rate of exponential decay of epsilon, higher means a slower decay
# TAU is the update rate of the target network
# LR is the learning rate of the ``AdamW`` optimizer
BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4


In [None]:
env = DemocellEnv()
env.reset()

# Get number of actions from gym action space
n_actions = env.action_space.n

# Get number of state observations
state, info = env.reset()
n_observations = len(state)


Published values on topic Democell/Conveyor_Speed_1: 0.3935546875
Published values on topic Democell/Conveyor_Speed_1: 0.464599609375


In [None]:
# if GPU is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

policy_net = DQN(n_observations, n_actions).to(device)
target_net = DQN(n_observations, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())


<All keys matched successfully>

In [None]:
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
memory = ReplayMemory(10000)


Connected with result code 0


In [None]:
import math


In [None]:
steps_done = 0


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return the largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            return policy_net(state).max(1).indices.view(1, 1)
    else:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)


In [None]:
episode_durations = []


def plot_durations(show_result=False):
    plt.figure(1)
    durations_t = torch.tensor(episode_durations, dtype=torch.float)
    if show_result:
        plt.title('Result')
    else:
        plt.clf()
        plt.title('Training...')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(durations_t.numpy())
    # Take 100 episode averages and plot them too
    if len(durations_t) >= 100:
        means = durations_t.unfold(0, 100, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(99), means))
        plt.plot(means.numpy())

    plt.pause(0.001)  # pause a bit so that plots are updated
    if is_ipython:
        if not show_result:
            display.display(plt.gcf())
            display.clear_output(wait=True)
        else:
            display.display(plt.gcf())


### Training loop


In [None]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                            batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                       if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1).values
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    with torch.no_grad():
        next_state_values[non_final_mask] = target_net(
            non_final_next_states).max(1).values
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values,
                     expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()


In [None]:
from itertools import count


In [None]:
if torch.cuda.is_available():
    num_episodes = 600
else:
    num_episodes = 50

for i_episode in range(num_episodes):
    # Initialize the environment and get it's state
    state, info = env.reset()
    state = torch.tensor(state, dtype=torch.float32,
                         device=device).unsqueeze(0)
    for t in count():
        action = select_action(state)
        observation, reward, terminated, truncated, _ = env.step(action.item())
        reward = torch.tensor([reward], device=device)
        done = terminated or truncated

        if terminated:
            next_state = None
        else:
            next_state = torch.tensor(
                observation, dtype=torch.float32, device=device).unsqueeze(0)

        # Store the transition in memory
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()

        # Soft update of the target network's weights
        # θ′ ← τ θ + (1 −τ )θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key] * \
                TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break

print('Complete')
plot_durations(show_result=True)
plt.ioff()
plt.show()


Published values on topic Democell/Conveyor_Speed_1: 0.63427734375


TypeError: must be real number, not dict