In [3]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 4.266666666666667
All assignments history: [6, 7, 5, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -190     |
| time/              |          |
|    fps             | 166      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -158.0
Standard deviation of reward: 0.0
Average successful assignments: 10.6
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 142         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008834934 |
|    clip_fraction        | 0.0689      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.225      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.27        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0532     |
|    value_loss           | 16.8        |
-

-------- Rollout Summary --------
Total mean reward: -150.0
Standard deviation of reward: 0.0
Average successful assignments: 15.28
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 10          |
|    time_elapsed         | 85          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020312376 |
|    clip_fraction        | 0.455       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.141       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.59        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0663     |
|    value_loss           | 4.8         |


-------- Rollout Summary --------
Total mean reward: -124.0
Standard deviation of reward: 0.0
Average successful assignments: 17.507407407407406
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 118         |
|    iterations           | 18          |
|    time_elapsed         | 155         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.027482104 |
|    clip_fraction        | 0.576       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.452       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.04        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0651     |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -118.0
Standard deviation of reward: 0.0
Average successful assignments: 22.253846153846155
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -183       |
| time/                   |            |
|    fps                  | 119        |
|    iterations           | 26         |
|    time_elapsed         | 223        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.01869446 |
|    clip_fraction        | 0.398      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.641      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.628      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0648    |
|    value_loss           | 3.74       |
-------

-------- Rollout Summary --------
Total mean reward: -76.0
Standard deviation of reward: 0.0
Average successful assignments: 26.107843137254903
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 34          |
|    time_elapsed         | 289         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.016986107 |
|    clip_fraction        | 0.339       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.763       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0904     |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0645     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -68.0
Standard deviation of reward: 0.0
Average successful assignments: 30.377777777777776
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 119        |
|    iterations           | 42         |
|    time_elapsed         | 359        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.01893273 |
|    clip_fraction        | 0.388      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.26      |
|    explained_variance   | 0.811      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.0486    |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0697    |
|    value_loss           | 2.39       |
--------

-------- Rollout Summary --------
Total mean reward: -76.0
Standard deviation of reward: 0.0
Average successful assignments: 32.576
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 50          |
|    time_elapsed         | 425         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.017718416 |
|    clip_fraction        | 0.349       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.866       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.112      |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0706     |
|    value_loss           | 1.93        |


-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 35.795402298850576
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 118         |
|    iterations           | 58          |
|    time_elapsed         | 501         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.017994465 |
|    clip_fraction        | 0.37        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.903       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.148      |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0724     |
|    value_loss           | 1.

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 38.412121212121214
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 66          |
|    time_elapsed         | 581         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.019118506 |
|    clip_fraction        | 0.406       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.913       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.00398    |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0792     |
|    value_loss           | 1.

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 42.236936936936935
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 74          |
|    time_elapsed         | 658         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.019951018 |
|    clip_fraction        | 0.406       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.935       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.304      |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0782     |
|    value_loss           | 1.0

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 46.1219512195122
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 114         |
|    iterations           | 82          |
|    time_elapsed         | 735         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.021495694 |
|    clip_fraction        | 0.45        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.938       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.321      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0838     |
|    value_loss           | 1    

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 49.434074074074076
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 113         |
|    iterations           | 90          |
|    time_elapsed         | 809         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019470362 |
|    clip_fraction        | 0.408       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.16       |
|    explained_variance   | 0.927       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0142     |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.082      |
|    value_loss           | 1.2

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 52.22244897959184
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -171        |
| time/                   |             |
|    fps                  | 113         |
|    iterations           | 98          |
|    time_elapsed         | 887         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020743836 |
|    clip_fraction        | 0.426       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.12       |
|    explained_variance   | 0.935       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.268      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0837     |
|    value_loss           | 1.01

In [4]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -196.0
Standard deviation of reward: 0.0
Average successful assignments: 3.6
All assignments history: [5, 7, 8, 8, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 156      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 10.333333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 2           |
|    time_elapsed         | 16          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009220557 |
|    clip_fraction        | 0.0673      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.195      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.04        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0526     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -84.0
Standard deviation of reward: 0.0
Average successful assignments: 28.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 10          |
|    time_elapsed         | 93          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.021149626 |
|    clip_fraction        | 0.479       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0964      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.43        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0656     |
|    value_loss           | 4.95        |
--

-------- Rollout Summary --------
Total mean reward: -28.0
Standard deviation of reward: 0.0
Average successful assignments: 38.18518518518518
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -183       |
| time/                   |            |
|    fps                  | 108        |
|    iterations           | 18         |
|    time_elapsed         | 169        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.02128157 |
|    clip_fraction        | 0.44       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.26      |
|    explained_variance   | 0.484      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.972      |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0638    |
|    value_loss           | 4.18       |
---------

-------- Rollout Summary --------
Total mean reward: -58.0
Standard deviation of reward: 0.0
Average successful assignments: 41.54615384615385
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 108        |
|    iterations           | 26         |
|    time_elapsed         | 245        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.02152161 |
|    clip_fraction        | 0.449      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.26      |
|    explained_variance   | 0.655      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.357      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0695    |
|    value_loss           | 3.65       |
---------

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 44.431372549019606
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 34          |
|    time_elapsed         | 322         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019182649 |
|    clip_fraction        | 0.39        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.714       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0707     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 46.56825396825397
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 42          |
|    time_elapsed         | 395         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.014906834 |
|    clip_fraction        | 0.273       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.764       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0652     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 49.95466666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 50          |
|    time_elapsed         | 475         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.015305791 |
|    clip_fraction        | 0.286       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.804       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.272       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0687     |
|    value_loss           | 2.61 

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 53.16896551724138
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 58          |
|    time_elapsed         | 552         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.017318651 |
|    clip_fraction        | 0.346       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.836       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.239       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0736     |
|    value_loss           | 2.25

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 55.71313131313131
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 66          |
|    time_elapsed         | 629         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.017789854 |
|    clip_fraction        | 0.36        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.863       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.00323     |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.077      |
|    value_loss           | 1.84 

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 57.81531531531532
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 74          |
|    time_elapsed         | 706         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.018504765 |
|    clip_fraction        | 0.373       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.888       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.142       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.078      |
|    value_loss           | 1.65

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 59.82032520325203
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 82          |
|    time_elapsed         | 782         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018818716 |
|    clip_fraction        | 0.388       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.16       |
|    explained_variance   | 0.888       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.311       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0815     |
|    value_loss           | 1.69

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 61.23777777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -172        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 90          |
|    time_elapsed         | 857         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.021113224 |
|    clip_fraction        | 0.439       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.1        |
|    explained_variance   | 0.911       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.214       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0842     |
|    value_loss           | 1.31

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 62.86598639455782
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -168        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 98          |
|    time_elapsed         | 934         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.019369101 |
|    clip_fraction        | 0.431       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.98       |
|    explained_variance   | 0.9         |
|    learning_rate        | 0.00018     |
|    loss                 | -0.133      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.085      |
|    value_loss           | 1.36

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 7.266666666666667
All assignments history: [6, 10, 11, 11, 11, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -180     |
| time/              |          |
|    fps             | 144      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -146.0
Standard deviation of reward: 0.0
Average successful assignments: 14.133333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 2           |
|    time_elapsed         | 16          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008556066 |
|    clip_fraction        | 0.0545      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | 0.00934     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.37        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.053      |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -128.0
Standard deviation of reward: 0.0
Average successful assignments: 21.433333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 10          |
|    time_elapsed         | 93          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019627456 |
|    clip_fraction        | 0.423       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0668      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.34        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0626     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -86.0
Standard deviation of reward: 0.0
Average successful assignments: 23.94814814814815
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -184       |
| time/                   |            |
|    fps                  | 108        |
|    iterations           | 18         |
|    time_elapsed         | 170        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.02196468 |
|    clip_fraction        | 0.484      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.462      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.97       |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0661    |
|    value_loss           | 4.21       |
---------

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 31.766666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 26          |
|    time_elapsed         | 249         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.018124592 |
|    clip_fraction        | 0.366       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.644       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.407       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.065      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 40.22352941176471
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -183       |
| time/                   |            |
|    fps                  | 106        |
|    iterations           | 34         |
|    time_elapsed         | 328        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.01563276 |
|    clip_fraction        | 0.312      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.26      |
|    explained_variance   | 0.715      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.39       |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0674    |
|    value_loss           | 3.23       |
-----------

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 43.888888888888886
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 42          |
|    time_elapsed         | 405         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.018540595 |
|    clip_fraction        | 0.376       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.796       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.19        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0705     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 47.67733333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 50          |
|    time_elapsed         | 481         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.016516412 |
|    clip_fraction        | 0.332       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.87        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.122       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0712     |
|    value_loss           | 1.87

-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 49.73103448275862
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 58          |
|    time_elapsed         | 549         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.019360548 |
|    clip_fraction        | 0.394       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.904       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.141      |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0784     |
|    value_loss           | 1.4

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 51.65858585858586
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 66          |
|    time_elapsed         | 614         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.018525463 |
|    clip_fraction        | 0.383       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.923       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0252     |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0793     |
|    value_loss           | 1.27 

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 53.57297297297297
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 74          |
|    time_elapsed         | 676         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.017975684 |
|    clip_fraction        | 0.369       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.937       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.134      |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0777     |
|    value_loss           | 1.07

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 55.55365853658537
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 113         |
|    iterations           | 82          |
|    time_elapsed         | 738         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.019846207 |
|    clip_fraction        | 0.399       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.942       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0369     |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0811     |
|    value_loss           | 1.03

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 57.09407407407407
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 90          |
|    time_elapsed         | 798         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019357506 |
|    clip_fraction        | 0.411       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.945       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0679     |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0851     |
|    value_loss           | 1.07

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 58.220408163265304
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 98          |
|    time_elapsed         | 858         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020734066 |
|    clip_fraction        | 0.455       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.14       |
|    explained_variance   | 0.943       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0902     |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0874     |
|    value_loss           | 0.98

In [6]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -180.0
Standard deviation of reward: 0.0
Average successful assignments: 9.933333333333334
All assignments history: [7, 8, 11, 13, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -180     |
| time/              |          |
|    fps             | 197      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -150.0
Standard deviation of reward: 0.0
Average successful assignments: 14.666666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 161         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009309849 |
|    clip_fraction        | 0.0719      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.0769     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.07        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0545     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 22.72
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 143        |
|    iterations           | 10         |
|    time_elapsed         | 71         |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01987244 |
|    clip_fraction        | 0.444      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.0773     |
|    learning_rate        | 0.00018    |
|    loss                 | 0.64       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0629    |
|    value_loss           | 5.06       |
--------------------

-------- Rollout Summary --------
Total mean reward: -102.0
Standard deviation of reward: 0.0
Average successful assignments: 21.01851851851852
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 142         |
|    iterations           | 18          |
|    time_elapsed         | 129         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.024129447 |
|    clip_fraction        | 0.524       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.341       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.63        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0627     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -100.0
Standard deviation of reward: 0.0
Average successful assignments: 29.074358974358976
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 142         |
|    iterations           | 26          |
|    time_elapsed         | 187         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.016812038 |
|    clip_fraction        | 0.33        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.531       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.35        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.06       |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -42.0
Standard deviation of reward: 0.0
Average successful assignments: 35.009803921568626
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 141         |
|    iterations           | 34          |
|    time_elapsed         | 245         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.017185014 |
|    clip_fraction        | 0.339       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.657       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.49        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0645     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -42.0
Standard deviation of reward: 0.0
Average successful assignments: 38.35079365079365
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 140         |
|    iterations           | 42          |
|    time_elapsed         | 305         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.019159442 |
|    clip_fraction        | 0.37        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.737       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.81        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0668     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 41.64666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 139         |
|    iterations           | 50          |
|    time_elapsed         | 368         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.017672587 |
|    clip_fraction        | 0.367       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.784       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.37        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0689     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: -46.0
Standard deviation of reward: 0.0
Average successful assignments: 43.237931034482756
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 138         |
|    iterations           | 58          |
|    time_elapsed         | 429         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.017137963 |
|    clip_fraction        | 0.331       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.824       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.567       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0721     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 45.62323232323232
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -179       |
| time/                   |            |
|    fps                  | 137        |
|    iterations           | 66         |
|    time_elapsed         | 492        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.01496763 |
|    clip_fraction        | 0.29       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.22      |
|    explained_variance   | 0.853      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.269      |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0715    |
|    value_loss           | 2.1        |
----------

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 47.35405405405405
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 74          |
|    time_elapsed         | 555         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.018009314 |
|    clip_fraction        | 0.377       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.897       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0606      |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0775     |
|    value_loss           | 1.6

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 49.917886178861785
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 82          |
|    time_elapsed         | 616         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.019636707 |
|    clip_fraction        | 0.421       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.911       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0395      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0829     |
|    value_loss           | 1.3

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 52.55407407407407
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 90          |
|    time_elapsed         | 676         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018767398 |
|    clip_fraction        | 0.381       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.16       |
|    explained_variance   | 0.94        |
|    learning_rate        | 0.00018     |
|    loss                 | -0.243      |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0788     |
|    value_loss           | 0.98

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 55.03265306122449
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -173       |
| time/                   |            |
|    fps                  | 135        |
|    iterations           | 98         |
|    time_elapsed         | 738        |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.01750645 |
|    clip_fraction        | 0.38       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.09      |
|    explained_variance   | 0.92       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.0704     |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0826    |
|    value_loss           | 1.32       |
----------

In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -196.0
Standard deviation of reward: 0.0
Average successful assignments: 4.066666666666666
All assignments history: [5, 4, 8, 13, 11, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -184     |
| time/              |          |
|    fps             | 173      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -172.0
Standard deviation of reward: 0.0
Average successful assignments: 8.333333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -182         |
| time/                   |              |
|    fps                  | 153          |
|    iterations           | 2            |
|    time_elapsed         | 13           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0101311095 |
|    clip_fraction        | 0.0929       |
|    clip_range           | 0.15         |
|    entropy_loss         | -8.29        |
|    explained_variance   | -0.418       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.55         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0567      |
|    value_

-------- Rollout Summary --------
Total mean reward: -98.0
Standard deviation of reward: 0.0
Average successful assignments: 14.826666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 139         |
|    iterations           | 10          |
|    time_elapsed         | 73          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020809796 |
|    clip_fraction        | 0.46        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.129       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.85        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0654     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 28.114814814814814
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 137         |
|    iterations           | 18          |
|    time_elapsed         | 134         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.021822298 |
|    clip_fraction        | 0.486       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.52        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.46        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0697     |
|    value_loss           | 3.92

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 39.56410256410256
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -183       |
| time/                   |            |
|    fps                  | 136        |
|    iterations           | 26         |
|    time_elapsed         | 195        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.01568822 |
|    clip_fraction        | 0.314      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.726      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.846      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0676    |
|    value_loss           | 3.15       |
----------

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 45.76470588235294
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 135         |
|    iterations           | 34          |
|    time_elapsed         | 257         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.017289814 |
|    clip_fraction        | 0.356       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.814       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.624       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.072      |
|    value_loss           | 2.42

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 50.05873015873016
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 135         |
|    iterations           | 42          |
|    time_elapsed         | 318         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.018672038 |
|    clip_fraction        | 0.371       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.87        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.207       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.074      |
|    value_loss           | 1.9

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 52.59466666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 134         |
|    iterations           | 50          |
|    time_elapsed         | 379         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.019346472 |
|    clip_fraction        | 0.389       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.903       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0507      |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0779     |
|    value_loss           | 1.5

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 55.6551724137931
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 134         |
|    iterations           | 58          |
|    time_elapsed         | 440         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.020232543 |
|    clip_fraction        | 0.43        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.909       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0374     |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0841     |
|    value_loss           | 1.49 

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 58.23434343434344
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 134         |
|    iterations           | 66          |
|    time_elapsed         | 501         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.020345218 |
|    clip_fraction        | 0.412       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.932       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0536     |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0827     |
|    value_loss           | 1.18

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 60.436036036036036
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -176       |
| time/                   |            |
|    fps                  | 134        |
|    iterations           | 74         |
|    time_elapsed         | 563        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.02104899 |
|    clip_fraction        | 0.429      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.2       |
|    explained_variance   | 0.932      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.0239     |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0824    |
|    value_loss           | 1.12       |
---------

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 61.954471544715446
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -175       |
| time/                   |            |
|    fps                  | 134        |
|    iterations           | 82         |
|    time_elapsed         | 624        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01998797 |
|    clip_fraction        | 0.427      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.16      |
|    explained_variance   | 0.94       |
|    learning_rate        | 0.00018    |
|    loss                 | -0.204     |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0845    |
|    value_loss           | 0.99       |
----------

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 63.11185185185185
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -172        |
| time/                   |             |
|    fps                  | 134         |
|    iterations           | 90          |
|    time_elapsed         | 687         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019513289 |
|    clip_fraction        | 0.422       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.1        |
|    explained_variance   | 0.928       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.221      |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0834     |
|    value_loss           | 1.2 

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 64.46190476190476
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -166        |
| time/                   |             |
|    fps                  | 134         |
|    iterations           | 98          |
|    time_elapsed         | 748         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.016803917 |
|    clip_fraction        | 0.363       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.95       |
|    explained_variance   | 0.909       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0308     |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0817     |
|    value_loss           | 1.41

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -196.0
Standard deviation of reward: 0.0
Average successful assignments: 4.333333333333333
All assignments history: [8, 7, 7, 8, 15, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -182     |
| time/              |          |
|    fps             | 204      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -144.0
Standard deviation of reward: 0.0
Average successful assignments: 12.933333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 158         |
|    iterations           | 2           |
|    time_elapsed         | 12          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009389493 |
|    clip_fraction        | 0.0727      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.112      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.1         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0549     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -104.0
Standard deviation of reward: 0.0
Average successful assignments: 21.153333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 142         |
|    iterations           | 10          |
|    time_elapsed         | 71          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019642206 |
|    clip_fraction        | 0.432       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.137       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.09        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.064      |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -134.0
Standard deviation of reward: 0.0
Average successful assignments: 27.062962962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 139         |
|    iterations           | 18          |
|    time_elapsed         | 131         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.024680093 |
|    clip_fraction        | 0.535       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.51        |
|    learning_rate        | 0.00018     |
|    loss                 | 2.01        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0706     |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -140.0
Standard deviation of reward: 0.0
Average successful assignments: 26.153846153846153
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 135         |
|    iterations           | 26          |
|    time_elapsed         | 195         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.019157592 |
|    clip_fraction        | 0.409       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.679       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.421       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0732     |
|    value_loss           | 3

-------- Rollout Summary --------
Total mean reward: -130.0
Standard deviation of reward: 0.0
Average successful assignments: 25.268627450980393
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 133         |
|    iterations           | 34          |
|    time_elapsed         | 260         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.017378412 |
|    clip_fraction        | 0.353       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.734       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.613       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0713     |
|    value_loss           | 3

-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 28.08888888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 132         |
|    iterations           | 42          |
|    time_elapsed         | 325         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.016391646 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.789       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.447       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0694     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 32.714666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 132         |
|    iterations           | 50          |
|    time_elapsed         | 386         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.014462565 |
|    clip_fraction        | 0.266       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.84        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.176       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0675     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -34.0
Standard deviation of reward: 0.0
Average successful assignments: 36.05287356321839
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 131         |
|    iterations           | 58          |
|    time_elapsed         | 450         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.015111204 |
|    clip_fraction        | 0.28        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.861       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.177       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0695     |
|    value_loss           | 1.9

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 38.845454545454544
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 130         |
|    iterations           | 66          |
|    time_elapsed         | 516         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.014944779 |
|    clip_fraction        | 0.285       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.873       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.605       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0719     |
|    value_loss           | 1.

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 41.55225225225225
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -176       |
| time/                   |            |
|    fps                  | 130        |
|    iterations           | 74         |
|    time_elapsed         | 582        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01605677 |
|    clip_fraction        | 0.314      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.18      |
|    explained_variance   | 0.884      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.0157     |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0749    |
|    value_loss           | 1.82       |
---------

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 43.9869918699187
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 129         |
|    iterations           | 82          |
|    time_elapsed         | 647         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.015569462 |
|    clip_fraction        | 0.302       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.14       |
|    explained_variance   | 0.909       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0576     |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0734     |
|    value_loss           | 1.39  

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 46.73037037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -171        |
| time/                   |             |
|    fps                  | 129         |
|    iterations           | 90          |
|    time_elapsed         | 713         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018787315 |
|    clip_fraction        | 0.397       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.09       |
|    explained_variance   | 0.908       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.229       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0812     |
|    value_loss           | 1.35 

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 49.39523809523809
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -165        |
| time/                   |             |
|    fps                  | 128         |
|    iterations           | 98          |
|    time_elapsed         | 778         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.016564235 |
|    clip_fraction        | 0.359       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.96       |
|    explained_variance   | 0.902       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0282      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0775     |
|    value_loss           | 1.49

In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -170.0
Standard deviation of reward: 0.0
Average successful assignments: 12.133333333333333
All assignments history: [9, 5, 6, 6, 6, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 162      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -128.0
Standard deviation of reward: 0.0
Average successful assignments: 19.266666666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -186         |
| time/                   |              |
|    fps                  | 136          |
|    iterations           | 2            |
|    time_elapsed         | 14           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0090663005 |
|    clip_fraction        | 0.0677       |
|    clip_range           | 0.15         |
|    entropy_loss         | -8.29        |
|    explained_variance   | 0.0309       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.1          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0513      |
|    value

-------- Rollout Summary --------
Total mean reward: -62.0
Standard deviation of reward: 0.0
Average successful assignments: 21.033333333333335
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 10          |
|    time_elapsed         | 80          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020919636 |
|    clip_fraction        | 0.461       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0725      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.6         |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0646     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -34.0
Standard deviation of reward: 0.0
Average successful assignments: 29.874074074074073
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 18          |
|    time_elapsed         | 145         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.026191123 |
|    clip_fraction        | 0.535       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.445       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.16        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0684     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -94.0
Standard deviation of reward: 0.0
Average successful assignments: 34.33076923076923
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 26          |
|    time_elapsed         | 211         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.018982524 |
|    clip_fraction        | 0.404       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.619       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.3         |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0681     |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 38.66470588235294
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 34          |
|    time_elapsed         | 277         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.018285021 |
|    clip_fraction        | 0.338       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.717       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0654     |
|    value_loss           | 3.27 

-------- Rollout Summary --------
Total mean reward: -58.0
Standard deviation of reward: 0.0
Average successful assignments: 43.03968253968254
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 42          |
|    time_elapsed         | 341         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.017111156 |
|    clip_fraction        | 0.349       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.797       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.446       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.071      |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -32.0
Standard deviation of reward: 0.0
Average successful assignments: 44.49333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 50          |
|    time_elapsed         | 408         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.018296938 |
|    clip_fraction        | 0.362       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.841       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.438       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0736     |
|    value_loss           | 2.1

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 47.9
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 58          |
|    time_elapsed         | 471         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.017493518 |
|    clip_fraction        | 0.35        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.86        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0153      |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0748     |
|    value_loss           | 1.97        |
---

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 51.87676767676768
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 66          |
|    time_elapsed         | 532         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.017020352 |
|    clip_fraction        | 0.341       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.869       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.249       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0761     |
|    value_loss           | 2.01

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 55.01441441441442
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 74          |
|    time_elapsed         | 599         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.018854966 |
|    clip_fraction        | 0.39        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.906       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.165      |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0788     |
|    value_loss           | 1.41

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 57.51951219512195
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -176       |
| time/                   |            |
|    fps                  | 126        |
|    iterations           | 82         |
|    time_elapsed         | 665        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01772343 |
|    clip_fraction        | 0.367      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.19      |
|    explained_variance   | 0.91       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.0366     |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0783    |
|    value_loss           | 1.38       |
----------

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 59.337037037037035
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 90          |
|    time_elapsed         | 732         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018318376 |
|    clip_fraction        | 0.378       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.14       |
|    explained_variance   | 0.905       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0979     |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0813     |
|    value_loss           | 1.4

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 61.010884353741496
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -170        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 98          |
|    time_elapsed         | 798         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020218978 |
|    clip_fraction        | 0.414       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.09       |
|    explained_variance   | 0.9         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.121       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0811     |
|    value_loss           | 1.5

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 7.2
All assignments history: [7, 13, 4, 5, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -185     |
| time/              |          |
|    fps             | 153      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -144.0
Standard deviation of reward: 0.0
Average successful assignments: 14.033333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 141         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009492032 |
|    clip_fraction        | 0.0843      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.317      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.71        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0533     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -20.0
Standard deviation of reward: 0.0
Average successful assignments: 38.22666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 10          |
|    time_elapsed         | 81          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019528527 |
|    clip_fraction        | 0.414       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.0522      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.87        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0599     |
|    value_loss           | 5.6

-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 42.34814814814815
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -183       |
| time/                   |            |
|    fps                  | 124        |
|    iterations           | 18         |
|    time_elapsed         | 147        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.02215746 |
|    clip_fraction        | 0.467      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.456      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.507      |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.066     |
|    value_loss           | 4.43       |
---------

-------- Rollout Summary --------
Total mean reward: -72.0
Standard deviation of reward: 0.0
Average successful assignments: 44.815384615384616
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 26          |
|    time_elapsed         | 213         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.021502724 |
|    clip_fraction        | 0.412       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.61        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.528       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0683     |
|    value_loss           | 4 

-------- Rollout Summary --------
Total mean reward: -50.0
Standard deviation of reward: 0.0
Average successful assignments: 45.57843137254902
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 34          |
|    time_elapsed         | 279         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019135483 |
|    clip_fraction        | 0.388       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.675       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.174       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0679     |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 46.67777777777778
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 123        |
|    iterations           | 42         |
|    time_elapsed         | 348        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.01911948 |
|    clip_fraction        | 0.365      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.25      |
|    explained_variance   | 0.714      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.992      |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0648    |
|    value_loss           | 3.6        |
---------

-------- Rollout Summary --------
Total mean reward: -36.0
Standard deviation of reward: 0.0
Average successful assignments: 47.39333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 123         |
|    iterations           | 50          |
|    time_elapsed         | 413         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.018283144 |
|    clip_fraction        | 0.369       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.731       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.07        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0671     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: -36.0
Standard deviation of reward: 0.0
Average successful assignments: 48.701149425287355
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 58          |
|    time_elapsed         | 474         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.019909287 |
|    clip_fraction        | 0.396       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.733       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.103       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0683     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -46.0
Standard deviation of reward: 0.0
Average successful assignments: 49.60808080808081
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 66          |
|    time_elapsed         | 533         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.018202009 |
|    clip_fraction        | 0.358       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.753       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.00812    |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0668     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: -64.0
Standard deviation of reward: 0.0
Average successful assignments: 49.75855855855856
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 74          |
|    time_elapsed         | 593         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.020066135 |
|    clip_fraction        | 0.399       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.76        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.292       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0664     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: -50.0
Standard deviation of reward: 0.0
Average successful assignments: 50.06260162601626
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 128         |
|    iterations           | 82          |
|    time_elapsed         | 652         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018756527 |
|    clip_fraction        | 0.37        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.771       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.48        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0705     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: -20.0
Standard deviation of reward: 0.0
Average successful assignments: 51.245925925925924
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -175       |
| time/                   |            |
|    fps                  | 128        |
|    iterations           | 90         |
|    time_elapsed         | 718        |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.01980574 |
|    clip_fraction        | 0.401      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.21      |
|    explained_variance   | 0.798      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.478      |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.07      |
|    value_loss           | 2.77       |
--------

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 52.42721088435374
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 98          |
|    time_elapsed         | 784         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.017254286 |
|    clip_fraction        | 0.348       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.861       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.00197     |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0748     |
|    value_loss           | 1.9

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -184.0
Standard deviation of reward: 0.0
Average successful assignments: 7.666666666666667
All assignments history: [3, 4, 10, 7, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 163      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -154.0
Standard deviation of reward: 0.0
Average successful assignments: 12.566666666666666
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -187       |
| time/                   |            |
|    fps                  | 138        |
|    iterations           | 2          |
|    time_elapsed         | 14         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00853355 |
|    clip_fraction        | 0.0588     |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.29      |
|    explained_variance   | -0.26      |
|    learning_rate        | 0.00018    |
|    loss                 | 2.39       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0522    |
|    value_loss           | 17         |
-------

-------- Rollout Summary --------
Total mean reward: -134.0
Standard deviation of reward: 0.0
Average successful assignments: 19.666666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 10          |
|    time_elapsed         | 82          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019335331 |
|    clip_fraction        | 0.418       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.134       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.757       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0607     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -180.0
Standard deviation of reward: 0.0
Average successful assignments: 16.522222222222222
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 18          |
|    time_elapsed         | 147         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.025391018 |
|    clip_fraction        | 0.542       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.559       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.57        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0714     |
|    value_loss           | 3

-------- Rollout Summary --------
Total mean reward: -54.0
Standard deviation of reward: 0.0
Average successful assignments: 25.01025641025641
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 26          |
|    time_elapsed         | 212         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020771975 |
|    clip_fraction        | 0.414       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.655       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.04        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0682     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: -80.0
Standard deviation of reward: 0.0
Average successful assignments: 28.641176470588235
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 125        |
|    iterations           | 34         |
|    time_elapsed         | 277        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.01513356 |
|    clip_fraction        | 0.304      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.26      |
|    explained_variance   | 0.738      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.733      |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0643    |
|    value_loss           | 3.3        |
--------

-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 32.79047619047619
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 42          |
|    time_elapsed         | 342         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.018730354 |
|    clip_fraction        | 0.369       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.821       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.63        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0731     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 36.26533333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 50          |
|    time_elapsed         | 407         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.018885985 |
|    clip_fraction        | 0.396       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.864       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.444       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0782     |
|    value_loss           | 1.9

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 39.71264367816092
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -180      |
| time/                   |           |
|    fps                  | 125       |
|    iterations           | 58        |
|    time_elapsed         | 471       |
|    total_timesteps      | 59392     |
| train/                  |           |
|    approx_kl            | 0.0196969 |
|    clip_fraction        | 0.405     |
|    clip_range           | 0.15      |
|    entropy_loss         | -8.24     |
|    explained_variance   | 0.896     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.169     |
|    n_updates            | 570       |
|    policy_gradient_loss | -0.0784   |
|    value_loss           | 1.72      |
-----------------------------

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 43.10808080808081
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -179       |
| time/                   |            |
|    fps                  | 125        |
|    iterations           | 66         |
|    time_elapsed         | 537        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.01801666 |
|    clip_fraction        | 0.373      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.91       |
|    learning_rate        | 0.00018    |
|    loss                 | -0.173     |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0777    |
|    value_loss           | 1.58       |
---------

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 46.32612612612613
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -179       |
| time/                   |            |
|    fps                  | 126        |
|    iterations           | 74         |
|    time_elapsed         | 600        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01866072 |
|    clip_fraction        | 0.38       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.23      |
|    explained_variance   | 0.919      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.00273    |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0782    |
|    value_loss           | 1.4        |
-----------

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 49.12357723577236
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 82          |
|    time_elapsed         | 664         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.020076953 |
|    clip_fraction        | 0.426       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.932       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0698     |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0814     |
|    value_loss           | 1.2 

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 52.007407407407406
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 90          |
|    time_elapsed         | 726         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019761331 |
|    clip_fraction        | 0.404       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.929       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.00985     |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0809     |
|    value_loss           | 1.3

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 54.73401360544218
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 98          |
|    time_elapsed         | 787         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.021395272 |
|    clip_fraction        | 0.431       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.932       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0488     |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0832     |
|    value_loss           | 1.11

In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -194.0
Standard deviation of reward: 0.0
Average successful assignments: 4.0
All assignments history: [9, 7, 3, 6, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 497      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -172.0
Standard deviation of reward: 0.0
Average successful assignments: 8.033333333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -186         |
| time/                   |              |
|    fps                  | 420          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0080239875 |
|    clip_fraction        | 0.0525       |
|    clip_range           | 0.15         |
|    entropy_loss         | -8.29        |
|    explained_variance   | -0.167       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.28         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0481      |
|    value_

-------- Rollout Summary --------
Total mean reward: -88.0
Standard deviation of reward: 0.0
Average successful assignments: 16.866666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 374         |
|    iterations           | 10          |
|    time_elapsed         | 27          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020075016 |
|    clip_fraction        | 0.452       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.0798      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.68        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0633     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -96.0
Standard deviation of reward: 0.0
Average successful assignments: 21.703703703703702
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 371         |
|    iterations           | 18          |
|    time_elapsed         | 49          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.025853176 |
|    clip_fraction        | 0.562       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.547       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.5         |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0717     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -114.0
Standard deviation of reward: 0.0
Average successful assignments: 24.48205128205128
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 369         |
|    iterations           | 26          |
|    time_elapsed         | 72          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.019362355 |
|    clip_fraction        | 0.371       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.655       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.681       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0648     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -98.0
Standard deviation of reward: 0.0
Average successful assignments: 25.788235294117648
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 367         |
|    iterations           | 34          |
|    time_elapsed         | 94          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.020803621 |
|    clip_fraction        | 0.419       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.692       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0713     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -86.0
Standard deviation of reward: 0.0
Average successful assignments: 27.5968253968254
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -183      |
| time/                   |           |
|    fps                  | 367       |
|    iterations           | 42        |
|    time_elapsed         | 117       |
|    total_timesteps      | 43008     |
| train/                  |           |
|    approx_kl            | 0.0184464 |
|    clip_fraction        | 0.365     |
|    clip_range           | 0.15      |
|    entropy_loss         | -8.25     |
|    explained_variance   | 0.752     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.366     |
|    n_updates            | 410       |
|    policy_gradient_loss | -0.0696   |
|    value_loss           | 3.1       |
------------------------------

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 30.214666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 366         |
|    iterations           | 50          |
|    time_elapsed         | 139         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.019726617 |
|    clip_fraction        | 0.4         |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.79        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.873       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0728     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -34.0
Standard deviation of reward: 0.0
Average successful assignments: 34.14252873563218
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -181       |
| time/                   |            |
|    fps                  | 366        |
|    iterations           | 58         |
|    time_elapsed         | 162        |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.01832936 |
|    clip_fraction        | 0.344      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.795      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.172      |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.0694    |
|    value_loss           | 3.04       |
---------

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 37.34747474747475
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -181       |
| time/                   |            |
|    fps                  | 365        |
|    iterations           | 66         |
|    time_elapsed         | 184        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.02031543 |
|    clip_fraction        | 0.382      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.23      |
|    explained_variance   | 0.82       |
|    learning_rate        | 0.00018    |
|    loss                 | 1.17       |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0715    |
|    value_loss           | 2.76       |
---------

-------- Rollout Summary --------
Total mean reward: -32.0
Standard deviation of reward: 0.0
Average successful assignments: 40.296396396396396
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -180       |
| time/                   |            |
|    fps                  | 366        |
|    iterations           | 74         |
|    time_elapsed         | 206        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01729567 |
|    clip_fraction        | 0.336      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.22      |
|    explained_variance   | 0.836      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.218      |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0719    |
|    value_loss           | 2.52       |
--------

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 42.8089430894309
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 366         |
|    iterations           | 82          |
|    time_elapsed         | 229         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.017162599 |
|    clip_fraction        | 0.337       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.838       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.666       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0713     |
|    value_loss           | 2.42  

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 46.080740740740744
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 366         |
|    iterations           | 90          |
|    time_elapsed         | 251         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.017785287 |
|    clip_fraction        | 0.367       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.849       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.076      |
|    value_loss           | 2.2

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 49.02925170068027
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 365         |
|    iterations           | 98          |
|    time_elapsed         | 274         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.015705856 |
|    clip_fraction        | 0.289       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.876       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.458       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0678     |
|    value_loss           | 1.95