In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 9.666666666666666
All assignments history: [21, 15, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -364     |
| time/              |          |
|    fps             | 175      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -290.0
Standard deviation of reward: 0.0
Average successful assignments: 29.208333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 161         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008452786 |
|    clip_fraction        | 0.0572      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.392      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.83        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2

-------- Rollout Summary --------
Total mean reward: -320.0
Standard deviation of reward: 0.0
Average successful assignments: 47.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 10          |
|    time_elapsed         | 83          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019808877 |
|    clip_fraction        | 0.424       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.00504     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.723       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0621     |
|    value_loss           | 4.59        |
-

-------- Rollout Summary --------
Total mean reward: -308.0
Standard deviation of reward: 0.0
Average successful assignments: 37.541666666666664
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -370       |
| time/                   |            |
|    fps                  | 107        |
|    iterations           | 18         |
|    time_elapsed         | 171        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.02493783 |
|    clip_fraction        | 0.557      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.103      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.389     |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0635    |
|    value_loss           | 2.83       |
-------

-------- Rollout Summary --------
Total mean reward: -104.0
Standard deviation of reward: 0.0
Average successful assignments: 57.68269230769231
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 101         |
|    iterations           | 26          |
|    time_elapsed         | 262         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020913351 |
|    clip_fraction        | 0.424       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.344       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.354       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0647     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -66.0
Standard deviation of reward: 0.0
Average successful assignments: 78.58823529411765
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 98          |
|    iterations           | 34          |
|    time_elapsed         | 354         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.013893138 |
|    clip_fraction        | 0.25        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.438       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.947       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0539     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: -24.0
Standard deviation of reward: 0.0
Average successful assignments: 93.15674603174604
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 42          |
|    time_elapsed         | 449         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.015000613 |
|    clip_fraction        | 0.272       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.488       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.65        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0601     |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 103.86833333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 50          |
|    time_elapsed         | 549         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.014912229 |
|    clip_fraction        | 0.262       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.497       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.936       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0593     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 111.875
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 58          |
|    time_elapsed         | 643         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.014634388 |
|    clip_fraction        | 0.253       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.549       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.759       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.059      |
|    value_loss           | 2.22        |

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 118.68560606060606
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 66          |
|    time_elapsed         | 750         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.015450563 |
|    clip_fraction        | 0.292       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.59        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.394       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0636     |
|    value_loss           | 2.1

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 124.509009009009
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 74          |
|    time_elapsed         | 847         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.014456153 |
|    clip_fraction        | 0.276       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.14       |
|    explained_variance   | 0.55        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.807       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0661     |
|    value_loss           | 2.03 

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 128.5701219512195
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 82          |
|    time_elapsed         | 942         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.016158968 |
|    clip_fraction        | 0.304       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.04       |
|    explained_variance   | 0.595       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.358       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0696     |
|    value_loss           | 1.9

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 133.67777777777778
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -354      |
| time/                   |           |
|    fps                  | 88        |
|    iterations           | 90        |
|    time_elapsed         | 1041      |
|    total_timesteps      | 92160     |
| train/                  |           |
|    approx_kl            | 0.0169486 |
|    clip_fraction        | 0.346     |
|    clip_range           | 0.15      |
|    entropy_loss         | -7.9      |
|    explained_variance   | 0.586     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.192     |
|    n_updates            | 890       |
|    policy_gradient_loss | -0.0757   |
|    value_loss           | 2.07      |
-----------------------------

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 140.02636054421768
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -343        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 98          |
|    time_elapsed         | 1156        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.016379552 |
|    clip_fraction        | 0.344       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.6        |
|    explained_variance   | 0.564       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.257       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.076      |
|    value_loss           | 2.

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -356.0
Standard deviation of reward: 0.0
Average successful assignments: 20.583333333333332
All assignments history: [15, 12, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -373     |
| time/              |          |
|    fps             | 91       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -354.0
Standard deviation of reward: 0.0
Average successful assignments: 20.833333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -375        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 2           |
|    time_elapsed         | 25          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008593367 |
|    clip_fraction        | 0.0608      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.256      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.99        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0522     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -92.0
Standard deviation of reward: 0.0
Average successful assignments: 36.7
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 10          |
|    time_elapsed         | 137         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.018469337 |
|    clip_fraction        | 0.391       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.00228     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.732       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0594     |
|    value_loss           | 4.3         |
--

-------- Rollout Summary --------
Total mean reward: -40.0
Standard deviation of reward: 0.0
Average successful assignments: 79.35648148148148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 18          |
|    time_elapsed         | 246         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.026906352 |
|    clip_fraction        | 0.551       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.0909      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.558       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0568     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: -64.0
Standard deviation of reward: 0.0
Average successful assignments: 97.88782051282051
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 26          |
|    time_elapsed         | 365         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020142736 |
|    clip_fraction        | 0.389       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.23        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.95        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0578     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: -80.0
Standard deviation of reward: 0.0
Average successful assignments: 106.90686274509804
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 34          |
|    time_elapsed         | 478         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.018468399 |
|    clip_fraction        | 0.329       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.295       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.491       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0569     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -74.0
Standard deviation of reward: 0.0
Average successful assignments: 113.89682539682539
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 42          |
|    time_elapsed         | 591         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.019558996 |
|    clip_fraction        | 0.377       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.356       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.84        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0642     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 123.45166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 50          |
|    time_elapsed         | 703         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.018552497 |
|    clip_fraction        | 0.359       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.409       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.121       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0628     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 131.42241379310346
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 58          |
|    time_elapsed         | 824         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.019286176 |
|    clip_fraction        | 0.359       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.426       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.33        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0612     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 137.12247474747474
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 66          |
|    time_elapsed         | 942         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.020409495 |
|    clip_fraction        | 0.373       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.445       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.911       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.061      |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 141.26013513513513
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 74          |
|    time_elapsed         | 1061        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.017910227 |
|    clip_fraction        | 0.338       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.422       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0764     |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0619     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 144.9369918699187
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 82          |
|    time_elapsed         | 1178        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018808149 |
|    clip_fraction        | 0.349       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.48        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.116       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0623     |
|    value_loss           | 2.45

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 148.1851851851852
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 90          |
|    time_elapsed         | 1297        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019418906 |
|    clip_fraction        | 0.366       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.447       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.817       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0645     |
|    value_loss           | 2.69

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 150.99659863945578
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 70          |
|    iterations           | 98          |
|    time_elapsed         | 1417        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020774763 |
|    clip_fraction        | 0.433       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.426       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.694       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0687     |
|    value_loss           | 2.6

In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -376.0
Standard deviation of reward: 0.0
Average successful assignments: 12.0
All assignments history: [15, 9, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -376     |
| time/              |          |
|    fps             | 87       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -366.0
Standard deviation of reward: 0.0
Average successful assignments: 14.291666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009014997 |
|    clip_fraction        | 0.0685      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.12       |
|    learning_rate        | 0.00018     |
|    loss                 | 3.1         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0526     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 46.9
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 69          |
|    iterations           | 10          |
|    time_elapsed         | 147         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019487405 |
|    clip_fraction        | 0.422       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.00266     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0632     |
|    value_loss           | 4.37        |
-

-------- Rollout Summary --------
Total mean reward: -154.0
Standard deviation of reward: 0.0
Average successful assignments: 58.629629629629626
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -374       |
| time/                   |            |
|    fps                  | 69         |
|    iterations           | 18         |
|    time_elapsed         | 264        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.02823316 |
|    clip_fraction        | 0.583      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.28      |
|    explained_variance   | 0.0479     |
|    learning_rate        | 0.00018    |
|    loss                 | -0.015     |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0607    |
|    value_loss           | 2.91       |
-------

-------- Rollout Summary --------
Total mean reward: -146.0
Standard deviation of reward: 0.0
Average successful assignments: 67.91025641025641
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 70          |
|    iterations           | 26          |
|    time_elapsed         | 376         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020952586 |
|    clip_fraction        | 0.417       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.214       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.125       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.061      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 82.45343137254902
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 34          |
|    time_elapsed         | 474         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019400863 |
|    clip_fraction        | 0.337       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.292       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0571     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 100.5952380952381
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 42          |
|    time_elapsed         | 564         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.020965926 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.335       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.02        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0626     |
|    value_loss           | 2.82

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 114.32666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 50          |
|    time_elapsed         | 652         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.022072688 |
|    clip_fraction        | 0.437       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.363       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.582       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0663     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 123.125
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 58          |
|    time_elapsed         | 738         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.020554017 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.371       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.546       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0627     |
|    value_loss           | 2.7         |


-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 128.9419191919192
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 66          |
|    time_elapsed         | 823         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.018171921 |
|    clip_fraction        | 0.372       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.351       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.252      |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0641     |
|    value_loss           | 2.67 

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 133.87387387387386
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 74          |
|    time_elapsed         | 906         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.020336378 |
|    clip_fraction        | 0.391       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.411       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.913       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0664     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 138.33536585365854
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -361       |
| time/                   |            |
|    fps                  | 84         |
|    iterations           | 82         |
|    time_elapsed         | 990        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01894705 |
|    clip_fraction        | 0.383      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.12      |
|    explained_variance   | 0.447      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.05       |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.069     |
|    value_loss           | 2.6        |
---------

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 142.5888888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 90          |
|    time_elapsed         | 1080        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.017766915 |
|    clip_fraction        | 0.374       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.99       |
|    explained_variance   | 0.404       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.535       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0721     |
|    value_loss           | 2.59

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 146.48469387755102
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -349        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 98          |
|    time_elapsed         | 1171        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020625148 |
|    clip_fraction        | 0.427       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.81       |
|    explained_variance   | 0.457       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.795       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0765     |
|    value_loss           | 2.6

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -360.0
Standard deviation of reward: 0.0
Average successful assignments: 18.75
All assignments history: [15, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -375     |
| time/              |          |
|    fps             | 111      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -274.0
Standard deviation of reward: 0.0
Average successful assignments: 36.833333333333336
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 101         |
|    iterations           | 2           |
|    time_elapsed         | 20          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008114351 |
|    clip_fraction        | 0.0546      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.229      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.99        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0488     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -142.0
Standard deviation of reward: 0.0
Average successful assignments: 43.1
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 10          |
|    time_elapsed         | 110         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020152409 |
|    clip_fraction        | 0.45        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0144      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.547       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0661     |
|    value_loss           | 4.15        |
-

-------- Rollout Summary --------
Total mean reward: -54.0
Standard deviation of reward: 0.0
Average successful assignments: 79.15277777777777
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -372       |
| time/                   |            |
|    fps                  | 92         |
|    iterations           | 18         |
|    time_elapsed         | 198        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.02940902 |
|    clip_fraction        | 0.599      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.28      |
|    explained_variance   | 0.0748     |
|    learning_rate        | 0.00018    |
|    loss                 | 0.235      |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.066     |
|    value_loss           | 2.96       |
---------

-------- Rollout Summary --------
Total mean reward: -92.0
Standard deviation of reward: 0.0
Average successful assignments: 96.35576923076923
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 26          |
|    time_elapsed         | 290         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.022142565 |
|    clip_fraction        | 0.456       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.187       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.193       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0595     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 103.03921568627452
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 34          |
|    time_elapsed         | 378         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019669576 |
|    clip_fraction        | 0.379       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.242       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0601     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 115.2936507936508
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 42          |
|    time_elapsed         | 468         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.019203287 |
|    clip_fraction        | 0.358       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.311       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.669       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0574     |
|    value_loss           | 2.76

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 123.59333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 50          |
|    time_elapsed         | 558         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.016818505 |
|    clip_fraction        | 0.319       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.278       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.183      |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0587     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 130.19971264367817
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 58          |
|    time_elapsed         | 648         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.013860701 |
|    clip_fraction        | 0.239       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.338       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.6         |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0521     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 135.8181818181818
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 66          |
|    time_elapsed         | 738         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.016062455 |
|    clip_fraction        | 0.276       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.403       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.179      |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0571     |
|    value_loss           | 2.58

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 141.6858108108108
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 74          |
|    time_elapsed         | 829         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.017947096 |
|    clip_fraction        | 0.354       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.361       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.343       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0625     |
|    value_loss           | 2.73

-------- Rollout Summary --------
Total mean reward: 108.0
Standard deviation of reward: 0.0
Average successful assignments: 148.0030487804878
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 82          |
|    time_elapsed         | 920         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018189047 |
|    clip_fraction        | 0.342       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.349       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.161       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0674     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 154.07592592592593
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 90          |
|    time_elapsed         | 1011        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019800628 |
|    clip_fraction        | 0.421       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.11       |
|    explained_variance   | 0.398       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0249      |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0724     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 158.7372448979592
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -353        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 98          |
|    time_elapsed         | 1100        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.018298142 |
|    clip_fraction        | 0.403       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.01       |
|    explained_variance   | 0.368       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0732     |
|    value_loss           | 2.51

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -388.0
Standard deviation of reward: 0.0
Average successful assignments: 7.333333333333333
All assignments history: [13, 15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 117      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -300.0
Standard deviation of reward: 0.0
Average successful assignments: 25.583333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 2           |
|    time_elapsed         | 18          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008654668 |
|    clip_fraction        | 0.0577      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.169      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.97        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0521     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 71.34166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 10          |
|    time_elapsed         | 107         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.018883113 |
|    clip_fraction        | 0.396       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0102      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.556       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0597     |
|    value_loss           | 4.44

-------- Rollout Summary --------
Total mean reward: -48.0
Standard deviation of reward: 0.0
Average successful assignments: 111.94907407407408
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 18          |
|    time_elapsed         | 203         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.025569115 |
|    clip_fraction        | 0.528       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.188       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0636     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 110.71153846153847
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 26          |
|    time_elapsed         | 294         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.023249269 |
|    clip_fraction        | 0.454       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.315       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.877       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.065      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 123.74019607843137
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -371       |
| time/                   |            |
|    fps                  | 90         |
|    iterations           | 34         |
|    time_elapsed         | 386        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.02205086 |
|    clip_fraction        | 0.426      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.25      |
|    explained_variance   | 0.337      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.0136    |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0643    |
|    value_loss           | 3.05       |
--------

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 133.02579365079364
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 42          |
|    time_elapsed         | 483         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.019164884 |
|    clip_fraction        | 0.352       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.422       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.824       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0615     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 144.25333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 50          |
|    time_elapsed         | 578         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.016699158 |
|    clip_fraction        | 0.297       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.451       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0584     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 88.0
Standard deviation of reward: 0.0
Average successful assignments: 153.16522988505747
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 58          |
|    time_elapsed         | 675         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.019709652 |
|    clip_fraction        | 0.371       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.403       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.128      |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0653     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 159.16414141414143
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 66          |
|    time_elapsed         | 772         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.016675727 |
|    clip_fraction        | 0.335       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.423       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.696       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0646     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 104.0
Standard deviation of reward: 0.0
Average successful assignments: 164.77364864864865
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -363        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 74          |
|    time_elapsed         | 868         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.021160347 |
|    clip_fraction        | 0.437       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.15       |
|    explained_variance   | 0.445       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.577       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0699     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 170.22052845528455
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 82          |
|    time_elapsed         | 964         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.021722734 |
|    clip_fraction        | 0.432       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.08       |
|    explained_variance   | 0.394       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.59        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0708     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 175.62037037037038
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -352       |
| time/                   |            |
|    fps                  | 86         |
|    iterations           | 90         |
|    time_elapsed         | 1059       |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.02380855 |
|    clip_fraction        | 0.487      |
|    clip_range           | 0.15       |
|    entropy_loss         | -7.91      |
|    explained_variance   | 0.331      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.12       |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0815    |
|    value_loss           | 3.06       |
--------

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 180.43707482993196
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -342        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 98          |
|    time_elapsed         | 1155        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.022100594 |
|    clip_fraction        | 0.475       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.68       |
|    explained_variance   | 0.23        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.453       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0809     |
|    value_loss           | 3.

In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 9.75
All assignments history: [12, 15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -373     |
| time/              |          |
|    fps             | 103      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -356.0
Standard deviation of reward: 0.0
Average successful assignments: 14.75
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -378        |
| time/                   |             |
|    fps                  | 99          |
|    iterations           | 2           |
|    time_elapsed         | 20          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008774238 |
|    clip_fraction        | 0.0614      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | 0.0881      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.51        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0514     |
|    value_loss           | 17          |


-------- Rollout Summary --------
Total mean reward: -72.0
Standard deviation of reward: 0.0
Average successful assignments: 32.68333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 10          |
|    time_elapsed         | 109         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019254882 |
|    clip_fraction        | 0.42        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.0125      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.66        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0606     |
|    value_loss           | 4.5

-------- Rollout Summary --------
Total mean reward: -76.0
Standard deviation of reward: 0.0
Average successful assignments: 68.22222222222223
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 18          |
|    time_elapsed         | 204         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.026081298 |
|    clip_fraction        | 0.556       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.121       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.261      |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0615     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -152.0
Standard deviation of reward: 0.0
Average successful assignments: 84.23076923076923
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 26          |
|    time_elapsed         | 299         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.021006785 |
|    clip_fraction        | 0.402       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.26        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.777       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0569     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -122.0
Standard deviation of reward: 0.0
Average successful assignments: 95.0906862745098
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 34          |
|    time_elapsed         | 394         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.018619042 |
|    clip_fraction        | 0.368       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.327       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.425       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0585     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 103.20039682539682
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 42          |
|    time_elapsed         | 490         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.020696027 |
|    clip_fraction        | 0.408       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.354       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.659       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0637     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 111.54
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 50          |
|    time_elapsed         | 585         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.019068733 |
|    clip_fraction        | 0.33        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.39        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.712       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0572     |
|    value_loss           | 2.81        |


-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 118.00431034482759
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -368       |
| time/                   |            |
|    fps                  | 87         |
|    iterations           | 58         |
|    time_elapsed         | 679        |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.01869753 |
|    clip_fraction        | 0.352      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.23      |
|    explained_variance   | 0.423      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.114     |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.0638    |
|    value_loss           | 2.64       |
---------

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 124.66666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 66          |
|    time_elapsed         | 776         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.018120358 |
|    clip_fraction        | 0.342       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.438       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.809       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0596     |
|    value_loss           | 2.64

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 130.9572072072072
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -366       |
| time/                   |            |
|    fps                  | 87         |
|    iterations           | 74         |
|    time_elapsed         | 864        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01988304 |
|    clip_fraction        | 0.419      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.21      |
|    explained_variance   | 0.406      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.33       |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0677    |
|    value_loss           | 2.78       |
----------

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 136.03048780487805
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 82          |
|    time_elapsed         | 949         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.019626731 |
|    clip_fraction        | 0.388       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.448       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.94        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0643     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 140.63611111111112
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 90          |
|    time_elapsed         | 1034        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.021874148 |
|    clip_fraction        | 0.437       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.381       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0211     |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0695     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 104.0
Standard deviation of reward: 0.0
Average successful assignments: 146.0518707482993
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 98          |
|    time_elapsed         | 1131        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.021638632 |
|    clip_fraction        | 0.421       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.407       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.995       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0671     |
|    value_loss           | 2.5

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -360.0
Standard deviation of reward: 0.0
Average successful assignments: 19.083333333333332
All assignments history: [16, 13, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -371     |
| time/              |          |
|    fps             | 106      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -258.0
Standard deviation of reward: 0.0
Average successful assignments: 40.541666666666664
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 2           |
|    time_elapsed         | 22          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007813905 |
|    clip_fraction        | 0.0507      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.275      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.56        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0499     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -234.0
Standard deviation of reward: 0.0
Average successful assignments: 83.75833333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 10          |
|    time_elapsed         | 118         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020043887 |
|    clip_fraction        | 0.436       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.00309     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.775       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0643     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -232.0
Standard deviation of reward: 0.0
Average successful assignments: 86.75925925925925
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 18          |
|    time_elapsed         | 212         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.025298249 |
|    clip_fraction        | 0.548       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.0896      |
|    learning_rate        | 0.00018     |
|    loss                 | -0.333      |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0634     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -132.0
Standard deviation of reward: 0.0
Average successful assignments: 95.96794871794872
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 26          |
|    time_elapsed         | 305         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.023503944 |
|    clip_fraction        | 0.48        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.185       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.802       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0654     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -146.0
Standard deviation of reward: 0.0
Average successful assignments: 98.36274509803921
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 34          |
|    time_elapsed         | 397         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019979134 |
|    clip_fraction        | 0.392       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.299       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.699       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0622     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -198.0
Standard deviation of reward: 0.0
Average successful assignments: 98.17063492063492
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 42          |
|    time_elapsed         | 490         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.018747851 |
|    clip_fraction        | 0.352       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.398       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0199     |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0625     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -174.0
Standard deviation of reward: 0.0
Average successful assignments: 98.43166666666667
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -369       |
| time/                   |            |
|    fps                  | 87         |
|    iterations           | 50         |
|    time_elapsed         | 582        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.01838402 |
|    clip_fraction        | 0.321      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.468      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.0634     |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0608    |
|    value_loss           | 2.52       |
--------

-------- Rollout Summary --------
Total mean reward: -68.0
Standard deviation of reward: 0.0
Average successful assignments: 102.4066091954023
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -368      |
| time/                   |           |
|    fps                  | 88        |
|    iterations           | 58        |
|    time_elapsed         | 673       |
|    total_timesteps      | 59392     |
| train/                  |           |
|    approx_kl            | 0.0169718 |
|    clip_fraction        | 0.323     |
|    clip_range           | 0.15      |
|    entropy_loss         | -8.22     |
|    explained_variance   | 0.432     |
|    learning_rate        | 0.00018   |
|    loss                 | 1.09      |
|    n_updates            | 570       |
|    policy_gradient_loss | -0.0617   |
|    value_loss           | 2.64      |
-----------------------------

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 109.71464646464646
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 66          |
|    time_elapsed         | 761         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.017409585 |
|    clip_fraction        | 0.316       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.48        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.206       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0601     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 118.59346846846847
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 74          |
|    time_elapsed         | 846         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.018107964 |
|    clip_fraction        | 0.344       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.531       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.71        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0655     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 127.60162601626017
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 82          |
|    time_elapsed         | 927         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.019815238 |
|    clip_fraction        | 0.405       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.09       |
|    explained_variance   | 0.549       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.134      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0727     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 130.0
Standard deviation of reward: 0.0
Average successful assignments: 136.28148148148148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -355        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 90          |
|    time_elapsed         | 1006        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018984305 |
|    clip_fraction        | 0.409       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.93       |
|    explained_variance   | 0.502       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.29        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0757     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 144.13095238095238
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -345        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 98          |
|    time_elapsed         | 1087        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.019137414 |
|    clip_fraction        | 0.405       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.62       |
|    explained_variance   | 0.52        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.452       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0778     |
|    value_loss           | 2.

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -388.0
Standard deviation of reward: 0.0
Average successful assignments: 7.083333333333333
All assignments history: [13, 12, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -375     |
| time/              |          |
|    fps             | 126      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -342.0
Standard deviation of reward: 0.0
Average successful assignments: 16.75
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 2           |
|    time_elapsed         | 18          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008803466 |
|    clip_fraction        | 0.0666      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.0821     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.12        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0502     |
|    value_loss           | 15.8        |


-------- Rollout Summary --------
Total mean reward: -336.0
Standard deviation of reward: 0.0
Average successful assignments: 35.025
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -373       |
| time/                   |            |
|    fps                  | 103        |
|    iterations           | 10         |
|    time_elapsed         | 98         |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01996264 |
|    clip_fraction        | 0.43       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.00655    |
|    learning_rate        | 0.00018    |
|    loss                 | 0.432      |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0616    |
|    value_loss           | 4.28       |
-------------------

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 49.14351851851852
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 102         |
|    iterations           | 18          |
|    time_elapsed         | 179         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.027682107 |
|    clip_fraction        | 0.57        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.109       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.371       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0652     |
|    value_loss           | 2.84 

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 81.73717948717949
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 104         |
|    iterations           | 26          |
|    time_elapsed         | 255         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.019053705 |
|    clip_fraction        | 0.358       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.255       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.244      |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0593     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: -48.0
Standard deviation of reward: 0.0
Average successful assignments: 95.30392156862744
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 105         |
|    iterations           | 34          |
|    time_elapsed         | 330         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.022276212 |
|    clip_fraction        | 0.417       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.328       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.76        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.062      |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 104.82142857142857
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -371       |
| time/                   |            |
|    fps                  | 107        |
|    iterations           | 42         |
|    time_elapsed         | 401        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.02366568 |
|    clip_fraction        | 0.434      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.23      |
|    explained_variance   | 0.317      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.243      |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0603    |
|    value_loss           | 2.98       |
--------

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 114.085
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 50          |
|    time_elapsed         | 472         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.019436086 |
|    clip_fraction        | 0.346       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.375       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.136      |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0613     |
|    value_loss           | 2.63        |

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 122.48132183908046
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 58          |
|    time_elapsed         | 544         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.018030718 |
|    clip_fraction        | 0.326       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.372       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0851     |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.059      |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 130.94065656565655
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 66          |
|    time_elapsed         | 615         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.016188148 |
|    clip_fraction        | 0.284       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.448       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.335       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0613     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 138.11036036036037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 74          |
|    time_elapsed         | 687         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.014099255 |
|    clip_fraction        | 0.242       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.12       |
|    explained_variance   | 0.503       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.161      |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0557     |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 146.0060975609756
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -358       |
| time/                   |            |
|    fps                  | 110        |
|    iterations           | 82         |
|    time_elapsed         | 761        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01501281 |
|    clip_fraction        | 0.278      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.04      |
|    explained_variance   | 0.522      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.269      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0632    |
|    value_loss           | 2.35       |
---------

-------- Rollout Summary --------
Total mean reward: 100.0
Standard deviation of reward: 0.0
Average successful assignments: 152.1287037037037
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -351         |
| time/                   |              |
|    fps                  | 110          |
|    iterations           | 90           |
|    time_elapsed         | 831          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0151684545 |
|    clip_fraction        | 0.297        |
|    clip_range           | 0.15         |
|    entropy_loss         | -7.89        |
|    explained_variance   | 0.462        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.677        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0672      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 157.69557823129253
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -340        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 98          |
|    time_elapsed         | 903         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.013616287 |
|    clip_fraction        | 0.284       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.64       |
|    explained_variance   | 0.45        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.677       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0691     |
|    value_loss           | 2.

In [15]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -370.0
Standard deviation of reward: 0.0
Average successful assignments: 14.333333333333334
All assignments history: [7, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -378     |
| time/              |          |
|    fps             | 130      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -316.0
Standard deviation of reward: 0.0
Average successful assignments: 25.541666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -378        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 2           |
|    time_elapsed         | 16          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008195158 |
|    clip_fraction        | 0.0538      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.174      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.19        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0483     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -328.0
Standard deviation of reward: 0.0
Average successful assignments: 41.141666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 114         |
|    iterations           | 10          |
|    time_elapsed         | 89          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020530485 |
|    clip_fraction        | 0.447       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.00434     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.627       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0644     |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 62.736111111111114
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 18          |
|    time_elapsed         | 158         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.024578076 |
|    clip_fraction        | 0.531       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.181       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.353      |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0638     |
|    value_loss           | 2

-------- Rollout Summary --------
Total mean reward: -108.0
Standard deviation of reward: 0.0
Average successful assignments: 79.25320512820512
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 114         |
|    iterations           | 26          |
|    time_elapsed         | 232         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.022220582 |
|    clip_fraction        | 0.48        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.393       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0722     |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0656     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -104.0
Standard deviation of reward: 0.0
Average successful assignments: 88.78921568627452
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 34          |
|    time_elapsed         | 322         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019603558 |
|    clip_fraction        | 0.345       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.454       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.93        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0588     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -34.0
Standard deviation of reward: 0.0
Average successful assignments: 99.98412698412699
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -371       |
| time/                   |            |
|    fps                  | 110        |
|    iterations           | 42         |
|    time_elapsed         | 389        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.01896058 |
|    clip_fraction        | 0.357      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.23      |
|    explained_variance   | 0.506      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.11       |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.061     |
|    value_loss           | 2.27       |
---------

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 109.77333333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -370       |
| time/                   |            |
|    fps                  | 110        |
|    iterations           | 50         |
|    time_elapsed         | 462        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.01780366 |
|    clip_fraction        | 0.338      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.23      |
|    explained_variance   | 0.575      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.155     |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0629    |
|    value_loss           | 2.15       |
--------

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 116.88218390804597
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 58          |
|    time_elapsed         | 538         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.016377132 |
|    clip_fraction        | 0.311       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.578       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.159       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0629     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 121.83333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 66          |
|    time_elapsed         | 611         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.019166829 |
|    clip_fraction        | 0.378       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.631       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.9         |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0653     |
|    value_loss           | 1.

-------- Rollout Summary --------
Total mean reward: -34.0
Standard deviation of reward: 0.0
Average successful assignments: 125.84909909909909
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -364       |
| time/                   |            |
|    fps                  | 111        |
|    iterations           | 74         |
|    time_elapsed         | 680        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01895722 |
|    clip_fraction        | 0.387      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.19      |
|    explained_variance   | 0.601      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.586      |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0701    |
|    value_loss           | 2.2        |
--------

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 129.1798780487805
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -362       |
| time/                   |            |
|    fps                  | 112        |
|    iterations           | 82         |
|    time_elapsed         | 747        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01940947 |
|    clip_fraction        | 0.402      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.16      |
|    explained_variance   | 0.62       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.295      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0729    |
|    value_loss           | 2          |
---------

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 132.96296296296296
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 113         |
|    iterations           | 90          |
|    time_elapsed         | 813         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019886479 |
|    clip_fraction        | 0.421       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.08       |
|    explained_variance   | 0.616       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.168       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0758     |
|    value_loss           | 2.0

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 136.74659863945578
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -352        |
| time/                   |             |
|    fps                  | 113         |
|    iterations           | 98          |
|    time_elapsed         | 880         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020095417 |
|    clip_fraction        | 0.429       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.98       |
|    explained_variance   | 0.63        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.874       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0777     |
|    value_loss           | 2.1

In [16]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -392.0
Standard deviation of reward: 0.0
Average successful assignments: 5.75
All assignments history: [16, 13, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -371     |
| time/              |          |
|    fps             | 149      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -354.0
Standard deviation of reward: 0.0
Average successful assignments: 13.5
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 135         |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009275534 |
|    clip_fraction        | 0.079       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.165      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.09        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0522     |
|    value_loss           | 15.7        |
-

-------- Rollout Summary --------
Total mean reward: -394.0
Standard deviation of reward: 0.0
Average successful assignments: 46.575
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 10          |
|    time_elapsed         | 81          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019742884 |
|    clip_fraction        | 0.427       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.00559     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.652       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0606     |
|    value_loss           | 4.29        |

-------- Rollout Summary --------
Total mean reward: -120.0
Standard deviation of reward: 0.0
Average successful assignments: 65.27777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 18          |
|    time_elapsed         | 150         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.028280614 |
|    clip_fraction        | 0.586       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.0507      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.35        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0617     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -174.0
Standard deviation of reward: 0.0
Average successful assignments: 71.82051282051282
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 26          |
|    time_elapsed         | 220         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.021739308 |
|    clip_fraction        | 0.415       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.305       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.44        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.061      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -180.0
Standard deviation of reward: 0.0
Average successful assignments: 82.79656862745098
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 34          |
|    time_elapsed         | 286         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.015464096 |
|    clip_fraction        | 0.281       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.358       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0556     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -66.0
Standard deviation of reward: 0.0
Average successful assignments: 92.3452380952381
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 42          |
|    time_elapsed         | 352         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.017486613 |
|    clip_fraction        | 0.335       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.386       |
|    learning_rate        | 0.00018     |
|    loss                 | 1           |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0596     |
|    value_loss           | 2.66

-------- Rollout Summary --------
Total mean reward: -26.0
Standard deviation of reward: 0.0
Average successful assignments: 101.44333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 50          |
|    time_elapsed         | 419         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.016231805 |
|    clip_fraction        | 0.304       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.438       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0975     |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0609     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 110.92672413793103
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 58          |
|    time_elapsed         | 486         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.016580295 |
|    clip_fraction        | 0.308       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.545       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0497      |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0635     |
|    value_loss           | 2.41

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 118.07449494949495
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 66          |
|    time_elapsed         | 552         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.016277283 |
|    clip_fraction        | 0.34        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.506       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.36        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0649     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 123.14301801801801
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 74          |
|    time_elapsed         | 617         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.016129669 |
|    clip_fraction        | 0.298       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.578       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.985       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0622     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 127.7571138211382
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -363        |
| time/                   |             |
|    fps                  | 123         |
|    iterations           | 82          |
|    time_elapsed         | 678         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.016703118 |
|    clip_fraction        | 0.33        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.15       |
|    explained_variance   | 0.599       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.537       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0675     |
|    value_loss           | 2.17 

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 132.01203703703703
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 90          |
|    time_elapsed         | 737         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.015602584 |
|    clip_fraction        | 0.292       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.07       |
|    explained_variance   | 0.622       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.155       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0665     |
|    value_loss           | 2.1

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 136.0841836734694
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -353        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 98          |
|    time_elapsed         | 791         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.016442172 |
|    clip_fraction        | 0.338       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.96       |
|    explained_variance   | 0.543       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.452       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0712     |
|    value_loss           | 2.26