In [2]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 6.533333333333333
All assignments history: [8, 6, 6, 7, 11, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -185     |
| time/              |          |
|    fps             | 365      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -166.0
Standard deviation of reward: 0.0
Average successful assignments: 10.333333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 301         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009608667 |
|    clip_fraction        | 0.076       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.155      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.67        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0535     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -160.0
Standard deviation of reward: 0.0
Average successful assignments: 15.206666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019303534 |
|    clip_fraction        | 0.413       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0864      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0616     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -78.0
Standard deviation of reward: 0.0
Average successful assignments: 18.677777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 18          |
|    time_elapsed         | 77          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.023387525 |
|    clip_fraction        | 0.496       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.503       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.26        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0673     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 32.82820512820513
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 235        |
|    iterations           | 26         |
|    time_elapsed         | 113        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.02062812 |
|    clip_fraction        | 0.44       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.684      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.163      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0699    |
|    value_loss           | 3.45       |
----------

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 43.17058823529412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 215         |
|    iterations           | 34          |
|    time_elapsed         | 161         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.016575957 |
|    clip_fraction        | 0.317       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.738       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.145       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0664     |
|    value_loss           | 3.12 

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 47.287301587301585
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 197        |
|    iterations           | 42         |
|    time_elapsed         | 217        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.01573599 |
|    clip_fraction        | 0.296      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.795      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.03       |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0662    |
|    value_loss           | 2.82       |
--------

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 51.462666666666664
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 183         |
|    iterations           | 50          |
|    time_elapsed         | 279         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.017636675 |
|    clip_fraction        | 0.352       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.812       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.81        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0705     |
|    value_loss           | 2.66

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 55.34022988505747
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -180       |
| time/                   |            |
|    fps                  | 173        |
|    iterations           | 58         |
|    time_elapsed         | 341        |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.01876794 |
|    clip_fraction        | 0.393      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.22      |
|    explained_variance   | 0.834      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.389      |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.0717    |
|    value_loss           | 2.4        |
----------

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 58.491919191919195
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 167         |
|    iterations           | 66          |
|    time_elapsed         | 404         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.018844906 |
|    clip_fraction        | 0.399       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.83        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.626       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0717     |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 61.69009009009009
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 161         |
|    iterations           | 74          |
|    time_elapsed         | 467         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.017072093 |
|    clip_fraction        | 0.345       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.832       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.449       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0671     |
|    value_loss           | 2.39

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 63.798373983739836
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -173       |
| time/                   |            |
|    fps                  | 157        |
|    iterations           | 82         |
|    time_elapsed         | 533        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.02146716 |
|    clip_fraction        | 0.446      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.15      |
|    explained_variance   | 0.836      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.112      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0739    |
|    value_loss           | 2.13       |
---------

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 65.66740740740741
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -171       |
| time/                   |            |
|    fps                  | 153        |
|    iterations           | 90         |
|    time_elapsed         | 600        |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.02005762 |
|    clip_fraction        | 0.416      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.08      |
|    explained_variance   | 0.827      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.13       |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0725    |
|    value_loss           | 2.26       |
----------

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 67.32993197278911
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -167        |
| time/                   |             |
|    fps                  | 150         |
|    iterations           | 98          |
|    time_elapsed         | 666         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020322692 |
|    clip_fraction        | 0.443       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.95       |
|    explained_variance   | 0.8         |
|    learning_rate        | 0.00018     |
|    loss                 | -0.109      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0764     |
|    value_loss           | 2.43

In [3]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 5.4
All assignments history: [4, 9, 9, 8, 11, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -184     |
| time/              |          |
|    fps             | 169      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -94.0
Standard deviation of reward: 0.0
Average successful assignments: 21.6
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 149         |
|    iterations           | 2           |
|    time_elapsed         | 13          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009752127 |
|    clip_fraction        | 0.0771      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.118      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.3         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0563     |
|    value_loss           | 16.5        |
--

-------- Rollout Summary --------
Total mean reward: -72.0
Standard deviation of reward: 0.0
Average successful assignments: 26.5
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 10          |
|    time_elapsed         | 81          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019670688 |
|    clip_fraction        | 0.429       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.134       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.453       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0622     |
|    value_loss           | 5.1         |
--

-------- Rollout Summary --------
Total mean reward: -122.0
Standard deviation of reward: 0.0
Average successful assignments: 30.02962962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 18          |
|    time_elapsed         | 153         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.023381446 |
|    clip_fraction        | 0.506       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.468       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.27        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0691     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -44.0
Standard deviation of reward: 0.0
Average successful assignments: 32.64102564102564
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 26          |
|    time_elapsed         | 218         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.018109586 |
|    clip_fraction        | 0.347       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.665       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.43        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.066      |
|    value_loss           | 3.5

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 39.35686274509804
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 122        |
|    iterations           | 34         |
|    time_elapsed         | 284        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.01924657 |
|    clip_fraction        | 0.382      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.26      |
|    explained_variance   | 0.779      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.994      |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0671    |
|    value_loss           | 2.84       |
---------

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 47.13968253968254
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 42          |
|    time_elapsed         | 349         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.019152729 |
|    clip_fraction        | 0.377       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.825       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.157       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0708     |
|    value_loss           | 2.5 

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 53.08133333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -181       |
| time/                   |            |
|    fps                  | 122        |
|    iterations           | 50         |
|    time_elapsed         | 418        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.01865163 |
|    clip_fraction        | 0.372      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.856      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.158      |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.073     |
|    value_loss           | 2.07       |
----------

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 57.644827586206894
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 58          |
|    time_elapsed         | 486         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.019321296 |
|    clip_fraction        | 0.4         |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.889       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.21        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0758     |
|    value_loss           | 1.8

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 60.74747474747475
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 66          |
|    time_elapsed         | 566         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.019250277 |
|    clip_fraction        | 0.4         |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.907       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0171     |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0751     |
|    value_loss           | 1.52

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 63.15675675675676
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 117         |
|    iterations           | 74          |
|    time_elapsed         | 643         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.021273522 |
|    clip_fraction        | 0.423       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.916       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0772      |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0791     |
|    value_loss           | 1.42

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 65.4130081300813
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 82          |
|    time_elapsed         | 720         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.020487696 |
|    clip_fraction        | 0.417       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.917       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0376      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0819     |
|    value_loss           | 1.35 

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 67.44
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -172        |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 90          |
|    time_elapsed         | 795         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.022436278 |
|    clip_fraction        | 0.475       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.13       |
|    explained_variance   | 0.937       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.126      |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0876     |
|    value_loss           | 0.957       |
--

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 69.37278911564626
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 98          |
|    time_elapsed         | 870         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.025265118 |
|    clip_fraction        | 0.5         |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.08       |
|    explained_variance   | 0.916       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0397     |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0878     |
|    value_loss           | 1.18

In [4]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -166.0
Standard deviation of reward: 0.0
Average successful assignments: 14.4
All assignments history: [8, 9, 8, 13, 8, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -182     |
| time/              |          |
|    fps             | 152      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 15.966666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 2           |
|    time_elapsed         | 16          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009628169 |
|    clip_fraction        | 0.0854      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | 0.123       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.25        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.056      |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -150.0
Standard deviation of reward: 0.0
Average successful assignments: 26.606666666666666
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 107        |
|    iterations           | 10         |
|    time_elapsed         | 94         |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01989601 |
|    clip_fraction        | 0.436      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.151      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.68       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0603    |
|    value_loss           | 5.22       |
-------

-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 19.344444444444445
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 18          |
|    time_elapsed         | 169         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.023919597 |
|    clip_fraction        | 0.521       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.459       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0662     |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -140.0
Standard deviation of reward: 0.0
Average successful assignments: 19.81282051282051
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 26          |
|    time_elapsed         | 245         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.017142924 |
|    clip_fraction        | 0.36        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.603       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.308       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0655     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 26.33921568627451
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 34          |
|    time_elapsed         | 319         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.018358361 |
|    clip_fraction        | 0.36        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.65        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.75        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0663     |
|    value_loss           | 3.86

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 32.84761904761905
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 42          |
|    time_elapsed         | 394         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.017131938 |
|    clip_fraction        | 0.335       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.698       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.51        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0668     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 38.73733333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 50          |
|    time_elapsed         | 469         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.016877586 |
|    clip_fraction        | 0.32        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.772       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0655     |
|    value_loss           | 3.02

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 44.225287356321836
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -180      |
| time/                   |           |
|    fps                  | 108       |
|    iterations           | 58        |
|    time_elapsed         | 548       |
|    total_timesteps      | 59392     |
| train/                  |           |
|    approx_kl            | 0.0182902 |
|    clip_fraction        | 0.378     |
|    clip_range           | 0.15      |
|    entropy_loss         | -8.24     |
|    explained_variance   | 0.814     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.107     |
|    n_updates            | 570       |
|    policy_gradient_loss | -0.0687   |
|    value_loss           | 2.56      |
-----------------------------

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 48.63939393939394
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 66          |
|    time_elapsed         | 623         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.016941588 |
|    clip_fraction        | 0.341       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.837       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.238       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0688     |
|    value_loss           | 2.41

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 52.27117117117117
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 74          |
|    time_elapsed         | 700         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.016967721 |
|    clip_fraction        | 0.339       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.849       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.108       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0689     |
|    value_loss           | 2.21

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 55.31544715447154
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 82          |
|    time_elapsed         | 776         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018649027 |
|    clip_fraction        | 0.375       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.868       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.969       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0702     |
|    value_loss           | 2.1 

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 58.27037037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 90          |
|    time_elapsed         | 851         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.017704576 |
|    clip_fraction        | 0.383       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.874       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.881       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0728     |
|    value_loss           | 1.92

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 61.24081632653061
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -172        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 98          |
|    time_elapsed         | 926         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.019859372 |
|    clip_fraction        | 0.419       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.1        |
|    explained_variance   | 0.872       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.362       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0754     |
|    value_loss           | 1.89

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 6.4
All assignments history: [8, 15, 6, 6, 11, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -182     |
| time/              |          |
|    fps             | 166      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -142.0
Standard deviation of reward: 0.0
Average successful assignments: 14.166666666666666
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -183       |
| time/                   |            |
|    fps                  | 130        |
|    iterations           | 2          |
|    time_elapsed         | 15         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00893743 |
|    clip_fraction        | 0.0597     |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.29      |
|    explained_variance   | -0.189     |
|    learning_rate        | 0.00018    |
|    loss                 | 2.17       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0542    |
|    value_loss           | 16.1       |
-------

-------- Rollout Summary --------
Total mean reward: -140.0
Standard deviation of reward: 0.0
Average successful assignments: 27.673333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 10          |
|    time_elapsed         | 93          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020267874 |
|    clip_fraction        | 0.446       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0783      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.36        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0633     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -66.0
Standard deviation of reward: 0.0
Average successful assignments: 33.27407407407407
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 18          |
|    time_elapsed         | 169         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.021207131 |
|    clip_fraction        | 0.437       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.544       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.68        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0666     |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: -54.0
Standard deviation of reward: 0.0
Average successful assignments: 29.638461538461538
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 26          |
|    time_elapsed         | 246         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020342538 |
|    clip_fraction        | 0.412       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.669       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.94        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0679     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 36.07058823529412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 34          |
|    time_elapsed         | 324         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.022093652 |
|    clip_fraction        | 0.445       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.743       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.21        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0746     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: -54.0
Standard deviation of reward: 0.0
Average successful assignments: 38.993650793650794
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 42          |
|    time_elapsed         | 400         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.021930521 |
|    clip_fraction        | 0.444       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.782       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0732     |
|    value_loss           | 3 

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 41.422666666666665
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 50          |
|    time_elapsed         | 475         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.021130478 |
|    clip_fraction        | 0.42        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.805       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.672       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0739     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 44.216091954022986
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 58          |
|    time_elapsed         | 550         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.020306878 |
|    clip_fraction        | 0.41        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.849       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.828       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0754     |
|    value_loss           | 2.25

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 47.6989898989899
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 66          |
|    time_elapsed         | 615         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.016984537 |
|    clip_fraction        | 0.337       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.875       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.401       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0711     |
|    value_loss           | 2.02  

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 50.32612612612613
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 74          |
|    time_elapsed         | 679         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.017764144 |
|    clip_fraction        | 0.342       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.896       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0778     |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0761     |
|    value_loss           | 1.67 

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 52.5130081300813
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -178       |
| time/                   |            |
|    fps                  | 113        |
|    iterations           | 82         |
|    time_elapsed         | 740        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01979295 |
|    clip_fraction        | 0.4        |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.21      |
|    explained_variance   | 0.908      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.0487    |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0809    |
|    value_loss           | 1.48       |
------------

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 54.26888888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 114         |
|    iterations           | 90          |
|    time_elapsed         | 801         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018620599 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.915       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.101       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0812     |
|    value_loss           | 1.44

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 56.08163265306123
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 98          |
|    time_elapsed         | 859         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.021216623 |
|    clip_fraction        | 0.429       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.15       |
|    explained_variance   | 0.921       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.102      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0839     |
|    value_loss           | 1.28

In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 7.133333333333334
All assignments history: [6, 10, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -185     |
| time/              |          |
|    fps             | 188      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -152.0
Standard deviation of reward: 0.0
Average successful assignments: 12.766666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 156         |
|    iterations           | 2           |
|    time_elapsed         | 13          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008743267 |
|    clip_fraction        | 0.0656      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.00796    |
|    learning_rate        | 0.00018     |
|    loss                 | 2.13        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0517     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -180.0
Standard deviation of reward: 0.0
Average successful assignments: 11.113333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 137         |
|    iterations           | 10          |
|    time_elapsed         | 74          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020099685 |
|    clip_fraction        | 0.436       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0818      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.801       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0618     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -90.0
Standard deviation of reward: 0.0
Average successful assignments: 17.92222222222222
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 137         |
|    iterations           | 18          |
|    time_elapsed         | 133         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.025716977 |
|    clip_fraction        | 0.559       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.396       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.357       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0654     |
|    value_loss           | 4.4

-------- Rollout Summary --------
Total mean reward: -130.0
Standard deviation of reward: 0.0
Average successful assignments: 19.68717948717949
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 137         |
|    iterations           | 26          |
|    time_elapsed         | 193         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020427722 |
|    clip_fraction        | 0.412       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.603       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.133       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0646     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 27.735294117647058
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 137         |
|    iterations           | 34          |
|    time_elapsed         | 253         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.020704381 |
|    clip_fraction        | 0.415       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.687       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.26        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0684     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -36.0
Standard deviation of reward: 0.0
Average successful assignments: 34.217460317460315
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 42          |
|    time_elapsed         | 314         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.017253172 |
|    clip_fraction        | 0.336       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.728       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.21        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0655     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -32.0
Standard deviation of reward: 0.0
Average successful assignments: 37.792
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 50          |
|    time_elapsed         | 374         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.017351896 |
|    clip_fraction        | 0.337       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.761       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.09        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.067      |
|    value_loss           | 2.97        |


-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 41.65172413793103
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 58          |
|    time_elapsed         | 434         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.015479231 |
|    clip_fraction        | 0.29        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.773       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.19        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0662     |
|    value_loss           | 2.91

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 44.382828282828285
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 66          |
|    time_elapsed         | 495         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.014000438 |
|    clip_fraction        | 0.271       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.803       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.196       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0634     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 46.54504504504504
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 74          |
|    time_elapsed         | 555         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.017121674 |
|    clip_fraction        | 0.345       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.823       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.148      |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.072      |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 48.71138211382114
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 82          |
|    time_elapsed         | 616         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018478503 |
|    clip_fraction        | 0.365       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.872       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0461      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0737     |
|    value_loss           | 1.7

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 51.22666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 90          |
|    time_elapsed         | 677         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.017932096 |
|    clip_fraction        | 0.359       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.892       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.224       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0744     |
|    value_loss           | 1.57

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 54.3421768707483
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 98          |
|    time_elapsed         | 737         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020221878 |
|    clip_fraction        | 0.425       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.13       |
|    explained_variance   | 0.898       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0811      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0807     |
|    value_loss           | 1.46 

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 7.4
All assignments history: [5, 8, 13, 7, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -184     |
| time/              |          |
|    fps             | 179      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -146.0
Standard deviation of reward: 0.0
Average successful assignments: 13.666666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 154         |
|    iterations           | 2           |
|    time_elapsed         | 13          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008541726 |
|    clip_fraction        | 0.0562      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.0107     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.66        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0533     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 15.586666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 138         |
|    iterations           | 10          |
|    time_elapsed         | 73          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019648036 |
|    clip_fraction        | 0.438       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0995      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.24        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0628     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -88.0
Standard deviation of reward: 0.0
Average successful assignments: 23.703703703703702
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 138         |
|    iterations           | 18          |
|    time_elapsed         | 132         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.021483602 |
|    clip_fraction        | 0.463       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.507       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.31        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0655     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -144.0
Standard deviation of reward: 0.0
Average successful assignments: 25.615384615384617
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 139         |
|    iterations           | 26          |
|    time_elapsed         | 190         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.018831879 |
|    clip_fraction        | 0.365       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.668       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0643     |
|    value_loss           | 3

-------- Rollout Summary --------
Total mean reward: -40.0
Standard deviation of reward: 0.0
Average successful assignments: 31.86078431372549
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 137         |
|    iterations           | 34          |
|    time_elapsed         | 252         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.017728876 |
|    clip_fraction        | 0.339       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.754       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.53        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0659     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 37.665079365079364
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 135         |
|    iterations           | 42          |
|    time_elapsed         | 316         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.016096734 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.838       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.695       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0687     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 43.48133333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 134         |
|    iterations           | 50          |
|    time_elapsed         | 381         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.015510194 |
|    clip_fraction        | 0.304       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.859       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.38        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0723     |
|    value_loss           | 2.2

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 46.94137931034483
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 134         |
|    iterations           | 58          |
|    time_elapsed         | 442         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.017319627 |
|    clip_fraction        | 0.352       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.896       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.188       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0749     |
|    value_loss           | 1.6  

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 50.73535353535353
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 133         |
|    iterations           | 66          |
|    time_elapsed         | 504         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.017077869 |
|    clip_fraction        | 0.343       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.9         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0297      |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0764     |
|    value_loss           | 1.63

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 54.1981981981982
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 132         |
|    iterations           | 74          |
|    time_elapsed         | 569         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.017706364 |
|    clip_fraction        | 0.368       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.917       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.158       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0787     |
|    value_loss           | 1.47 

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 57.1219512195122
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -176       |
| time/                   |            |
|    fps                  | 132        |
|    iterations           | 82         |
|    time_elapsed         | 634        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01673846 |
|    clip_fraction        | 0.362      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.17      |
|    explained_variance   | 0.919      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.00128    |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0784    |
|    value_loss           | 1.38       |
-----------

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 59.50592592592593
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -173       |
| time/                   |            |
|    fps                  | 131        |
|    iterations           | 90         |
|    time_elapsed         | 698        |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.01719652 |
|    clip_fraction        | 0.347      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.13      |
|    explained_variance   | 0.924      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.192     |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0765    |
|    value_loss           | 1.18       |
----------

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 61.410204081632656
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -170        |
| time/                   |             |
|    fps                  | 131         |
|    iterations           | 98          |
|    time_elapsed         | 763         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.016970018 |
|    clip_fraction        | 0.348       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.05       |
|    explained_variance   | 0.92        |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0271     |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0784     |
|    value_loss           | 1.3

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 7.733333333333333
All assignments history: [9, 8, 6, 13, 10, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -182     |
| time/              |          |
|    fps             | 170      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -122.0
Standard deviation of reward: 0.0
Average successful assignments: 18.066666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 146         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008524009 |
|    clip_fraction        | 0.0557      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.231      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.83        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0517     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 16.673333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 129         |
|    iterations           | 10          |
|    time_elapsed         | 78          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020035664 |
|    clip_fraction        | 0.438       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0772      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.54        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.063      |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -26.0
Standard deviation of reward: 0.0
Average successful assignments: 30.5
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -183       |
| time/                   |            |
|    fps                  | 127        |
|    iterations           | 18         |
|    time_elapsed         | 144        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.02348449 |
|    clip_fraction        | 0.503      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.517      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.5        |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.07      |
|    value_loss           | 4.09       |
----------------------

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 43.5
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 26          |
|    time_elapsed         | 209         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.023345754 |
|    clip_fraction        | 0.477       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.661       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.136       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0722     |
|    value_loss           | 3.75        |
--

-------- Rollout Summary --------
Total mean reward: -64.0
Standard deviation of reward: 0.0
Average successful assignments: 43.35098039215686
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 34          |
|    time_elapsed         | 275         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.020295529 |
|    clip_fraction        | 0.421       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.743       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.16        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0715     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 45.455555555555556
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 42          |
|    time_elapsed         | 339         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.019746177 |
|    clip_fraction        | 0.399       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.8         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.223       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.069      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -50.0
Standard deviation of reward: 0.0
Average successful assignments: 48.20933333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -181       |
| time/                   |            |
|    fps                  | 126        |
|    iterations           | 50         |
|    time_elapsed         | 403        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.01919023 |
|    clip_fraction        | 0.39       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.834      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.604      |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0701    |
|    value_loss           | 2.32       |
---------

-------- Rollout Summary --------
Total mean reward: -50.0
Standard deviation of reward: 0.0
Average successful assignments: 49.54367816091954
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 58          |
|    time_elapsed         | 471         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.020223431 |
|    clip_fraction        | 0.396       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.859       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.678       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0715     |
|    value_loss           | 2.0

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 51.352525252525254
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 66          |
|    time_elapsed         | 536         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.014968449 |
|    clip_fraction        | 0.278       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.866       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.216       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0678     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -34.0
Standard deviation of reward: 0.0
Average successful assignments: 52.3
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -179       |
| time/                   |            |
|    fps                  | 127        |
|    iterations           | 74         |
|    time_elapsed         | 594        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01665857 |
|    clip_fraction        | 0.304      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.21      |
|    explained_variance   | 0.882      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.298      |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0691    |
|    value_loss           | 1.78       |
----------------------

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 53.170731707317074
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 128         |
|    iterations           | 82          |
|    time_elapsed         | 652         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018666634 |
|    clip_fraction        | 0.385       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.903       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.235       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0778     |
|    value_loss           | 1.

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 54.78074074074074
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 129         |
|    iterations           | 90          |
|    time_elapsed         | 711         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018676108 |
|    clip_fraction        | 0.365       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.91        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0517      |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0762     |
|    value_loss           | 1.43

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 56.33061224489796
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 130         |
|    iterations           | 98          |
|    time_elapsed         | 768         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.019594016 |
|    clip_fraction        | 0.41        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.15       |
|    explained_variance   | 0.895       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0655      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0814     |
|    value_loss           | 1.61

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 6.0
All assignments history: [9, 8, 8, 8, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -184     |
| time/              |          |
|    fps             | 165      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -160.0
Standard deviation of reward: 0.0
Average successful assignments: 11.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 138         |
|    iterations           | 2           |
|    time_elapsed         | 14          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009393228 |
|    clip_fraction        | 0.071       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.39       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.5         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0533     |
|    value_loss           | 15.6        |
-

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 14.74
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 10          |
|    time_elapsed         | 80          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019930582 |
|    clip_fraction        | 0.448       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.107       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.66        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0623     |
|    value_loss           | 4.84        |
--

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 37.58888888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 18          |
|    time_elapsed         | 147         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.023365173 |
|    clip_fraction        | 0.491       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.604       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.83        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0683     |
|    value_loss           | 3.52

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 48.276923076923076
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 26          |
|    time_elapsed         | 213         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.024724415 |
|    clip_fraction        | 0.509       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.713       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.832       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0733     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: -42.0
Standard deviation of reward: 0.0
Average successful assignments: 51.62352941176471
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 34          |
|    time_elapsed         | 278         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.022702549 |
|    clip_fraction        | 0.46        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.781       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.551       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.073      |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 52.58412698412698
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 42          |
|    time_elapsed         | 343         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.017627269 |
|    clip_fraction        | 0.337       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.833       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.542       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0685     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 53.588
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -181       |
| time/                   |            |
|    fps                  | 126        |
|    iterations           | 50         |
|    time_elapsed         | 406        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.01972908 |
|    clip_fraction        | 0.38       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.25      |
|    explained_variance   | 0.862      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.195      |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0699    |
|    value_loss           | 2.11       |
--------------------

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 54.70919540229885
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 58          |
|    time_elapsed         | 470         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.018904686 |
|    clip_fraction        | 0.353       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.873       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.24        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0685     |
|    value_loss           | 2.1 

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 56.537373737373734
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 66          |
|    time_elapsed         | 533         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.018341595 |
|    clip_fraction        | 0.361       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.889       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0865      |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0687     |
|    value_loss           | 1.7

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 58.24324324324324
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -179       |
| time/                   |            |
|    fps                  | 126        |
|    iterations           | 74         |
|    time_elapsed         | 597        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.02044028 |
|    clip_fraction        | 0.413      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.23      |
|    explained_variance   | 0.883      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.231      |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0734    |
|    value_loss           | 1.98       |
----------

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 60.01869918699187
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 82          |
|    time_elapsed         | 661         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018772345 |
|    clip_fraction        | 0.372       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.894       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.114       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0725     |
|    value_loss           | 1.7 

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 61.644444444444446
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 90          |
|    time_elapsed         | 723         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019085556 |
|    clip_fraction        | 0.386       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.904       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.336       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0718     |
|    value_loss           | 1.5

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 63.03809523809524
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 98          |
|    time_elapsed         | 786         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020152047 |
|    clip_fraction        | 0.407       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.898       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.055      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0737     |
|    value_loss           | 1.63

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -180.0
Standard deviation of reward: 0.0
Average successful assignments: 10.066666666666666
All assignments history: [14, 8, 10, 8, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -180     |
| time/              |          |
|    fps             | 590      |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -132.0
Standard deviation of reward: 0.0
Average successful assignments: 17.666666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 472         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008878497 |
|    clip_fraction        | 0.06        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.147      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.71        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0525     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -140.0
Standard deviation of reward: 0.0
Average successful assignments: 18.04
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 420         |
|    iterations           | 10          |
|    time_elapsed         | 24          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.018513288 |
|    clip_fraction        | 0.386       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.11        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0562     |
|    value_loss           | 5.59        |


-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 32.92962962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 411         |
|    iterations           | 18          |
|    time_elapsed         | 44          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.023798667 |
|    clip_fraction        | 0.519       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.494       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.67        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0682     |
|    value_loss           | 4.18

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 41.55384615384615
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 409         |
|    iterations           | 26          |
|    time_elapsed         | 64          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.021033436 |
|    clip_fraction        | 0.424       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.68        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.492       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0679     |
|    value_loss           | 3.48

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 46.39607843137255
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 408         |
|    iterations           | 34          |
|    time_elapsed         | 85          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019495921 |
|    clip_fraction        | 0.369       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.768       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.407       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0649     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 50.352380952380955
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 408         |
|    iterations           | 42          |
|    time_elapsed         | 105         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.021200445 |
|    clip_fraction        | 0.419       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.808       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.294       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0697     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 55.228
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 404         |
|    iterations           | 50          |
|    time_elapsed         | 126         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.020357605 |
|    clip_fraction        | 0.423       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.852       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.213       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0739     |
|    value_loss           | 2.09        |
-

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 58.641379310344824
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 398         |
|    iterations           | 58          |
|    time_elapsed         | 148         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.016375605 |
|    clip_fraction        | 0.311       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.885       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.00775    |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0685     |
|    value_loss           | 1.7

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 61.9040404040404
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 394         |
|    iterations           | 66          |
|    time_elapsed         | 171         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.015241884 |
|    clip_fraction        | 0.289       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.888       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0124      |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0678     |
|    value_loss           | 1.71 

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 64.26126126126127
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 391         |
|    iterations           | 74          |
|    time_elapsed         | 193         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.016410183 |
|    clip_fraction        | 0.337       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.2        |
|    explained_variance   | 0.91        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.359       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0748     |
|    value_loss           | 1.42

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 66.04634146341463
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 388         |
|    iterations           | 82          |
|    time_elapsed         | 216         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.019745536 |
|    clip_fraction        | 0.405       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.16       |
|    explained_variance   | 0.916       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.088      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0823     |
|    value_loss           | 1.32

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 68.01629629629629
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 386         |
|    iterations           | 90          |
|    time_elapsed         | 238         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018953372 |
|    clip_fraction        | 0.41        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.08       |
|    explained_variance   | 0.898       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0212      |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0848     |
|    value_loss           | 1.51

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 69.84965986394558
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -166        |
| time/                   |             |
|    fps                  | 384         |
|    iterations           | 98          |
|    time_elapsed         | 261         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.020444762 |
|    clip_fraction        | 0.442       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.91       |
|    explained_variance   | 0.906       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.143      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0872     |
|    value_loss           | 1.29

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks200.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -182.0
Standard deviation of reward: 0.0
Average successful assignments: 8.466666666666667
All assignments history: [8, 9, 9, 3, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -185     |
| time/              |          |
|    fps             | 524      |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -134.0
Standard deviation of reward: 0.0
Average successful assignments: 16.5
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 433         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009080526 |
|    clip_fraction        | 0.0736      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.0991     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.81        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0537     |
|    value_loss           | 16.9        |
-

-------- Rollout Summary --------
Total mean reward: -28.0
Standard deviation of reward: 0.0
Average successful assignments: 34.153333333333336
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 10          |
|    time_elapsed         | 27          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019856341 |
|    clip_fraction        | 0.429       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.132       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.27        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0607     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -50.0
Standard deviation of reward: 0.0
Average successful assignments: 43.03703703703704
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 375         |
|    iterations           | 18          |
|    time_elapsed         | 49          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.025367308 |
|    clip_fraction        | 0.533       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.565       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.911       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.072      |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: -36.0
Standard deviation of reward: 0.0
Average successful assignments: 48.45641025641026
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 369         |
|    iterations           | 26          |
|    time_elapsed         | 72          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.018552922 |
|    clip_fraction        | 0.373       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.726       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.406       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0668     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 51.286274509803924
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 369         |
|    iterations           | 34          |
|    time_elapsed         | 94          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.018577185 |
|    clip_fraction        | 0.383       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.811       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0706     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 53.91111111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 367         |
|    iterations           | 42          |
|    time_elapsed         | 116         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.019859191 |
|    clip_fraction        | 0.41        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.877       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.189       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0771     |
|    value_loss           | 1.84

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 56.510666666666665
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -181       |
| time/                   |            |
|    fps                  | 369        |
|    iterations           | 50         |
|    time_elapsed         | 138        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.01868824 |
|    clip_fraction        | 0.36       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.25      |
|    explained_variance   | 0.92       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.147      |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0733    |
|    value_loss           | 1.31       |
---------

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 58.90574712643678
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -181       |
| time/                   |            |
|    fps                  | 373        |
|    iterations           | 58         |
|    time_elapsed         | 158        |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.01792594 |
|    clip_fraction        | 0.353      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.929      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.097     |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.0751    |
|    value_loss           | 1.28       |
----------

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 61.64343434343434
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 377         |
|    iterations           | 66          |
|    time_elapsed         | 179         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.017008308 |
|    clip_fraction        | 0.35        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.933       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.214      |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0788     |
|    value_loss           | 1.25

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 64.04144144144144
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -178       |
| time/                   |            |
|    fps                  | 379        |
|    iterations           | 74         |
|    time_elapsed         | 199        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01901958 |
|    clip_fraction        | 0.398      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.21      |
|    explained_variance   | 0.944      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.29      |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0807    |
|    value_loss           | 1          |
----------

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 66.72764227642277
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 382         |
|    iterations           | 82          |
|    time_elapsed         | 219         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.020683229 |
|    clip_fraction        | 0.439       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.948       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.211      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0873     |
|    value_loss           | 0.92

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 68.80888888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 385         |
|    iterations           | 90          |
|    time_elapsed         | 239         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.020105628 |
|    clip_fraction        | 0.424       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.15       |
|    explained_variance   | 0.947       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.249      |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0866     |
|    value_loss           | 0.91

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 70.55578231292517
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 386         |
|    iterations           | 98          |
|    time_elapsed         | 259         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.017661896 |
|    clip_fraction        | 0.391       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.04       |
|    explained_variance   | 0.921       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.139      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.084      |
|    value_loss           | 1.25