In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -376.0
Standard deviation of reward: 0.0
Average successful assignments: 12.583333333333334
All assignments history: [11, 20, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -369     |
| time/              |          |
|    fps             | 144      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -342.0
Standard deviation of reward: 0.0
Average successful assignments: 19.25
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -374         |
| time/                   |              |
|    fps                  | 140          |
|    iterations           | 2            |
|    time_elapsed         | 14           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0081374915 |
|    clip_fraction        | 0.0505       |
|    clip_range           | 0.15         |
|    entropy_loss         | -8.29        |
|    explained_variance   | 0.0444       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.82         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0501      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: -314.0
Standard deviation of reward: 0.0
Average successful assignments: 29.358333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 113         |
|    iterations           | 10          |
|    time_elapsed         | 90          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019628674 |
|    clip_fraction        | 0.423       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.000585    |
|    learning_rate        | 0.00018     |
|    loss                 | 1.62        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0647     |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -92.0
Standard deviation of reward: 0.0
Average successful assignments: 70.63425925925925
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 102         |
|    iterations           | 18          |
|    time_elapsed         | 180         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.023894338 |
|    clip_fraction        | 0.524       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.152       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.243       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0626     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -310.0
Standard deviation of reward: 0.0
Average successful assignments: 75.56089743589743
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 97          |
|    iterations           | 26          |
|    time_elapsed         | 272         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.021043818 |
|    clip_fraction        | 0.433       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.287       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.26        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.066      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -78.0
Standard deviation of reward: 0.0
Average successful assignments: 84.17892156862744
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 34          |
|    time_elapsed         | 365         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.018334553 |
|    clip_fraction        | 0.326       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.346       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.852       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0599     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -58.0
Standard deviation of reward: 0.0
Average successful assignments: 96.85317460317461
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 42          |
|    time_elapsed         | 462         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.016907314 |
|    clip_fraction        | 0.299       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.442       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.07        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0606     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -72.0
Standard deviation of reward: 0.0
Average successful assignments: 103.18666666666667
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -370       |
| time/                   |            |
|    fps                  | 91         |
|    iterations           | 50         |
|    time_elapsed         | 560        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.01605717 |
|    clip_fraction        | 0.298      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.23      |
|    explained_variance   | 0.465      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.14       |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0627    |
|    value_loss           | 2.49       |
--------

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 108.36637931034483
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 58          |
|    time_elapsed         | 658         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.015788542 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.49        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.44        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0641     |
|    value_loss           | 2.39

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 117.41919191919192
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 66          |
|    time_elapsed         | 761         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.015432591 |
|    clip_fraction        | 0.283       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.5         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.966       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0609     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 124.97522522522523
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 74          |
|    time_elapsed         | 858         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.017818524 |
|    clip_fraction        | 0.349       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.13       |
|    explained_variance   | 0.528       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0845      |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0661     |
|    value_loss           | 2.2

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 131.10365853658536
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 82          |
|    time_elapsed         | 957         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.017905653 |
|    clip_fraction        | 0.362       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.04       |
|    explained_variance   | 0.456       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.7         |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0721     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 136.5574074074074
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -351        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 90          |
|    time_elapsed         | 1058        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.019281505 |
|    clip_fraction        | 0.418       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.88       |
|    explained_variance   | 0.399       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0776     |
|    value_loss           | 2.73

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 141.21428571428572
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -342        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 98          |
|    time_elapsed         | 1174        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.018233355 |
|    clip_fraction        | 0.379       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.63       |
|    explained_variance   | 0.434       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.837       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0747     |
|    value_loss           | 2.5

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -378.0
Standard deviation of reward: 0.0
Average successful assignments: 11.416666666666666
All assignments history: [14, 13, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -373     |
| time/              |          |
|    fps             | 91       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -310.0
Standard deviation of reward: 0.0
Average successful assignments: 25.291666666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -376         |
| time/                   |              |
|    fps                  | 81           |
|    iterations           | 2            |
|    time_elapsed         | 25           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0080583915 |
|    clip_fraction        | 0.0479       |
|    clip_range           | 0.15         |
|    entropy_loss         | -8.29        |
|    explained_variance   | -0.00989     |
|    learning_rate        | 0.00018      |
|    loss                 | 3.08         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0499      |
|    value

-------- Rollout Summary --------
Total mean reward: -236.0
Standard deviation of reward: 0.0
Average successful assignments: 30.491666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 10          |
|    time_elapsed         | 138         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020091677 |
|    clip_fraction        | 0.435       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.00101     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.113       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0633     |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -226.0
Standard deviation of reward: 0.0
Average successful assignments: 62.379629629629626
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 18          |
|    time_elapsed         | 250         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.027486108 |
|    clip_fraction        | 0.569       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.193       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0668     |
|    value_loss           | 2

-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 67.68910256410257
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 26          |
|    time_elapsed         | 370         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.023153052 |
|    clip_fraction        | 0.468       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.35        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.177       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0673     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -130.0
Standard deviation of reward: 0.0
Average successful assignments: 77.87990196078431
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 34          |
|    time_elapsed         | 483         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019322634 |
|    clip_fraction        | 0.377       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.423       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.293      |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.063      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -114.0
Standard deviation of reward: 0.0
Average successful assignments: 86.0734126984127
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -372       |
| time/                   |            |
|    fps                  | 72         |
|    iterations           | 42         |
|    time_elapsed         | 596        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.01829672 |
|    clip_fraction        | 0.331      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.451      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.571      |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0619    |
|    value_loss           | 2.57       |
---------

-------- Rollout Summary --------
Total mean reward: -126.0
Standard deviation of reward: 0.0
Average successful assignments: 89.72166666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 50          |
|    time_elapsed         | 711         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.015290202 |
|    clip_fraction        | 0.28        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.525       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.521       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0604     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -110.0
Standard deviation of reward: 0.0
Average successful assignments: 93.066091954023
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 58          |
|    time_elapsed         | 833         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.016078316 |
|    clip_fraction        | 0.295       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.576       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.524       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0624     |
|    value_loss           | 2.22

-------- Rollout Summary --------
Total mean reward: -48.0
Standard deviation of reward: 0.0
Average successful assignments: 99.49873737373737
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 70          |
|    iterations           | 66          |
|    time_elapsed         | 952         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.016559474 |
|    clip_fraction        | 0.325       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.593       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0204      |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0685     |
|    value_loss           | 2.1

-------- Rollout Summary --------
Total mean reward: -78.0
Standard deviation of reward: 0.0
Average successful assignments: 103.85022522522523
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -366       |
| time/                   |            |
|    fps                  | 70         |
|    iterations           | 74         |
|    time_elapsed         | 1072       |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01731053 |
|    clip_fraction        | 0.325      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.19      |
|    explained_variance   | 0.603      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.35       |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.068     |
|    value_loss           | 2.13       |
--------

-------- Rollout Summary --------
Total mean reward: -44.0
Standard deviation of reward: 0.0
Average successful assignments: 107.64837398373983
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 70          |
|    iterations           | 82          |
|    time_elapsed         | 1189        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.017380659 |
|    clip_fraction        | 0.338       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.16       |
|    explained_variance   | 0.605       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.539       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0689     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 113.68611111111112
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 70          |
|    iterations           | 90          |
|    time_elapsed         | 1312        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018567266 |
|    clip_fraction        | 0.396       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.11       |
|    explained_variance   | 0.667       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.803       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0746     |
|    value_loss           | 1.8

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 119.82227891156462
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -358      |
| time/                   |           |
|    fps                  | 69        |
|    iterations           | 98        |
|    time_elapsed         | 1433      |
|    total_timesteps      | 100352    |
| train/                  |           |
|    approx_kl            | 0.0190287 |
|    clip_fraction        | 0.389     |
|    clip_range           | 0.15      |
|    entropy_loss         | -8.04     |
|    explained_variance   | 0.655     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.271     |
|    n_updates            | 970       |
|    policy_gradient_loss | -0.0722   |
|    value_loss           | 1.79      |
-----------------------------

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -388.0
Standard deviation of reward: 0.0
Average successful assignments: 6.583333333333333
All assignments history: [14, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -381     |
| time/              |          |
|    fps             | 112      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -342.0
Standard deviation of reward: 0.0
Average successful assignments: 16.541666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -376        |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 2           |
|    time_elapsed         | 20          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008579325 |
|    clip_fraction        | 0.059       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.153      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.53        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.05       |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -340.0
Standard deviation of reward: 0.0
Average successful assignments: 42.45
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 10          |
|    time_elapsed         | 109         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019020692 |
|    clip_fraction        | 0.409       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.00952     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.923       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0601     |
|    value_loss           | 4.15        |


-------- Rollout Summary --------
Total mean reward: -294.0
Standard deviation of reward: 0.0
Average successful assignments: 56.68981481481482
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 18          |
|    time_elapsed         | 199         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.023982663 |
|    clip_fraction        | 0.535       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.187       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.838       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0667     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -72.0
Standard deviation of reward: 0.0
Average successful assignments: 66.26282051282051
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 26          |
|    time_elapsed         | 291         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020081837 |
|    clip_fraction        | 0.405       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.357       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.284       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0622     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: -200.0
Standard deviation of reward: 0.0
Average successful assignments: 72.45833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 34          |
|    time_elapsed         | 381         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.016493931 |
|    clip_fraction        | 0.316       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.405       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0592     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -48.0
Standard deviation of reward: 0.0
Average successful assignments: 82.94642857142857
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 42          |
|    time_elapsed         | 472         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.017781874 |
|    clip_fraction        | 0.32        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.484       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.726       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0607     |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 94.69333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 50          |
|    time_elapsed         | 563         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.015885076 |
|    clip_fraction        | 0.294       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.465       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.996       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0624     |
|    value_loss           | 2.5  

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 105.84913793103448
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 58          |
|    time_elapsed         | 653         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.015320214 |
|    clip_fraction        | 0.279       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.514       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.06       |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 115.79797979797979
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -364       |
| time/                   |            |
|    fps                  | 90         |
|    iterations           | 66         |
|    time_elapsed         | 744        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.01638932 |
|    clip_fraction        | 0.313      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.14      |
|    explained_variance   | 0.511      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.436      |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0668    |
|    value_loss           | 2.54       |
---------

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 123.64752252252252
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 74          |
|    time_elapsed         | 836         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.019192655 |
|    clip_fraction        | 0.371       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.07       |
|    explained_variance   | 0.525       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.326       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0664     |
|    value_loss           | 2.1

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 130.03455284552845
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -356        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 82          |
|    time_elapsed         | 927         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018169947 |
|    clip_fraction        | 0.377       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.94       |
|    explained_variance   | 0.585       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.8         |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0737     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 135.3074074074074
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -348        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 90          |
|    time_elapsed         | 1019        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018202337 |
|    clip_fraction        | 0.386       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.78       |
|    explained_variance   | 0.641       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.483       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0747     |
|    value_loss           | 2.16

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 140.05527210884352
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -337        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 98          |
|    time_elapsed         | 1106        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.017377991 |
|    clip_fraction        | 0.368       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.62       |
|    explained_variance   | 0.655       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.704       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0763     |
|    value_loss           | 2.3

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -380.0
Standard deviation of reward: 0.0
Average successful assignments: 10.75
All assignments history: [15, 14, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -371     |
| time/              |          |
|    fps             | 111      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -326.0
Standard deviation of reward: 0.0
Average successful assignments: 22.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 2           |
|    time_elapsed         | 20          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.009243114 |
|    clip_fraction        | 0.0732      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.199      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.73        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0537     |
|    value_loss           | 17          |
-

-------- Rollout Summary --------
Total mean reward: -208.0
Standard deviation of reward: 0.0
Average successful assignments: 63.858333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 10          |
|    time_elapsed         | 115         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.019965034 |
|    clip_fraction        | 0.435       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.0029      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.599       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0637     |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -24.0
Standard deviation of reward: 0.0
Average successful assignments: 97.02777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 18          |
|    time_elapsed         | 211         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.029445704 |
|    clip_fraction        | 0.61        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.0543      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0652     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: -102.0
Standard deviation of reward: 0.0
Average successful assignments: 112.46474358974359
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 26          |
|    time_elapsed         | 301         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.023465049 |
|    clip_fraction        | 0.489       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.248       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.236      |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0661     |
|    value_loss           | 2

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 120.67401960784314
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 34          |
|    time_elapsed         | 397         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.019163208 |
|    clip_fraction        | 0.366       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.34        |
|    learning_rate        | 0.00018     |
|    loss                 | -0.21       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0607     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 129.75396825396825
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 42          |
|    time_elapsed         | 493         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.018331118 |
|    clip_fraction        | 0.352       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.49        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0631     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 138.85333333333332
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -369       |
| time/                   |            |
|    fps                  | 86         |
|    iterations           | 50         |
|    time_elapsed         | 590        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.02098663 |
|    clip_fraction        | 0.398      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.485      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.54       |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0641    |
|    value_loss           | 2.51       |
---------

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 145.85632183908046
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 58          |
|    time_elapsed         | 688         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.017865226 |
|    clip_fraction        | 0.33        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.454       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0624     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 151.57449494949495
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -367       |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 66         |
|    time_elapsed         | 786        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.01952626 |
|    clip_fraction        | 0.367      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.2       |
|    explained_variance   | 0.489      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.47       |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0621    |
|    value_loss           | 2.42       |
---------

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 155.8108108108108
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 74          |
|    time_elapsed         | 882         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.018516041 |
|    clip_fraction        | 0.362       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.13       |
|    explained_variance   | 0.439       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0677     |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0658     |
|    value_loss           | 2.58

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 159.3099593495935
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 82          |
|    time_elapsed         | 979         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.019349776 |
|    clip_fraction        | 0.393       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.05       |
|    explained_variance   | 0.421       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.182      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0661     |
|    value_loss           | 2.38

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 162.27777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -356        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 90          |
|    time_elapsed         | 1076        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018709991 |
|    clip_fraction        | 0.366       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.9        |
|    explained_variance   | 0.391       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0657     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 164.95493197278913
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -347       |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 98         |
|    time_elapsed         | 1168       |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.01661986 |
|    clip_fraction        | 0.343      |
|    clip_range           | 0.15       |
|    entropy_loss         | -7.7       |
|    explained_variance   | 0.312      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.604      |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0681    |
|    value_loss           | 2.43       |
---------

In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -362.0
Standard deviation of reward: 0.0
Average successful assignments: 18.166666666666668
All assignments history: [15, 13, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 106      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -318.0
Standard deviation of reward: 0.0
Average successful assignments: 27.375
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -372         |
| time/                   |              |
|    fps                  | 94           |
|    iterations           | 2            |
|    time_elapsed         | 21           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0085674375 |
|    clip_fraction        | 0.0601       |
|    clip_range           | 0.15         |
|    entropy_loss         | -8.29        |
|    explained_variance   | -0.223       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.65         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0518      |
|    value_loss       

-------- Rollout Summary --------
Total mean reward: -78.0
Standard deviation of reward: 0.0
Average successful assignments: 92.65833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 10          |
|    time_elapsed         | 117         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.018181328 |
|    clip_fraction        | 0.379       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | -0.00465    |
|    learning_rate        | 0.00018     |
|    loss                 | 0.483       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0571     |
|    value_loss           | 4.4

-------- Rollout Summary --------
Total mean reward: -304.0
Standard deviation of reward: 0.0
Average successful assignments: 79.54629629629629
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 18          |
|    time_elapsed         | 212         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.029556267 |
|    clip_fraction        | 0.599       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.124       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0946     |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0649     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -214.0
Standard deviation of reward: 0.0
Average successful assignments: 70.30128205128206
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 26          |
|    time_elapsed         | 308         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020260159 |
|    clip_fraction        | 0.39        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.261       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.35       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0604     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -248.0
Standard deviation of reward: 0.0
Average successful assignments: 69.96323529411765
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 34          |
|    time_elapsed         | 404         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.013618596 |
|    clip_fraction        | 0.236       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.397       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.946       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -206.0
Standard deviation of reward: 0.0
Average successful assignments: 71.67460317460318
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 42          |
|    time_elapsed         | 499         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.014742996 |
|    clip_fraction        | 0.268       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.428       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.198       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0583     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -62.0
Standard deviation of reward: 0.0
Average successful assignments: 78.67833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 593         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.016192107 |
|    clip_fraction        | 0.296       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.525       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.068      |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0595     |
|    value_loss           | 2.2

-------- Rollout Summary --------
Total mean reward: -158.0
Standard deviation of reward: 0.0
Average successful assignments: 84.69827586206897
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 58          |
|    time_elapsed         | 689         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.015688807 |
|    clip_fraction        | 0.293       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.616       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0452      |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.061      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 91.83712121212122
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 66          |
|    time_elapsed         | 780         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.016730398 |
|    clip_fraction        | 0.284       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.585       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.165       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0599     |
|    value_loss           | 2.1

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 99.37387387387388
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -367       |
| time/                   |            |
|    fps                  | 87         |
|    iterations           | 74         |
|    time_elapsed         | 866        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.01679759 |
|    clip_fraction        | 0.333      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.18      |
|    explained_variance   | 0.568      |
|    learning_rate        | 0.00018    |
|    loss                 | -0.202     |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0674    |
|    value_loss           | 2.17       |
-----------

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 106.04471544715447
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 82          |
|    time_elapsed         | 952         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.016336229 |
|    clip_fraction        | 0.312       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.14       |
|    explained_variance   | 0.605       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0638     |
|    value_loss           | 2.04

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 112.37777777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 90          |
|    time_elapsed         | 1048        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018466376 |
|    clip_fraction        | 0.384       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.07       |
|    explained_variance   | 0.591       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.496       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0724     |
|    value_loss           | 2.2

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 118.28061224489795
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -355        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 98          |
|    time_elapsed         | 1148        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.016532324 |
|    clip_fraction        | 0.334       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.94       |
|    explained_variance   | 0.52        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.607       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0694     |
|    value_loss           | 2.4

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 10.083333333333334
All assignments history: [13, 18, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -369     |
| time/              |          |
|    fps             | 117      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -356.0
Standard deviation of reward: 0.0
Average successful assignments: 15.291666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.010035658 |
|    clip_fraction        | 0.088       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.166      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.45        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0561     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -154.0
Standard deviation of reward: 0.0
Average successful assignments: 57.88333333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -373       |
| time/                   |            |
|    fps                  | 100        |
|    iterations           | 10         |
|    time_elapsed         | 101        |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01910078 |
|    clip_fraction        | 0.386      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.00564    |
|    learning_rate        | 0.00018    |
|    loss                 | 1.01       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0582    |
|    value_loss           | 4.6        |
--------

-------- Rollout Summary --------
Total mean reward: -274.0
Standard deviation of reward: 0.0
Average successful assignments: 69.71296296296296
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 102         |
|    iterations           | 18          |
|    time_elapsed         | 179         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.028247755 |
|    clip_fraction        | 0.599       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0852      |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0759     |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0638     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -70.0
Standard deviation of reward: 0.0
Average successful assignments: 84.0448717948718
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 104         |
|    iterations           | 26          |
|    time_elapsed         | 254         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.018528948 |
|    clip_fraction        | 0.344       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.285       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.312       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0547     |
|    value_loss           | 2.69

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 97.57843137254902
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 34          |
|    time_elapsed         | 327         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.014930086 |
|    clip_fraction        | 0.253       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.334       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.302       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0555     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 109.55357142857143
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 42          |
|    time_elapsed         | 400         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.017602704 |
|    clip_fraction        | 0.322       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.412       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.04        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0576     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 116.90166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 50          |
|    time_elapsed         | 473         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.015659109 |
|    clip_fraction        | 0.285       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.411       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.912       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0599     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 123.65948275862068
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -370       |
| time/                   |            |
|    fps                  | 108        |
|    iterations           | 58         |
|    time_elapsed         | 547        |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.01774753 |
|    clip_fraction        | 0.33       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.2       |
|    explained_variance   | 0.512      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.12       |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.0623    |
|    value_loss           | 2.3        |
----------

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 129.1489898989899
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -367       |
| time/                   |            |
|    fps                  | 108        |
|    iterations           | 66         |
|    time_elapsed         | 622        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.01732027 |
|    clip_fraction        | 0.359      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.16      |
|    explained_variance   | 0.541      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.291      |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0673    |
|    value_loss           | 2.26       |
-----------

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 132.92342342342343
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 74          |
|    time_elapsed         | 696         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.018003166 |
|    clip_fraction        | 0.374       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.09       |
|    explained_variance   | 0.537       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.545       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0723     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -46.0
Standard deviation of reward: 0.0
Average successful assignments: 135.0010162601626
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 82          |
|    time_elapsed         | 768         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018114932 |
|    clip_fraction        | 0.367       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.99       |
|    explained_variance   | 0.548       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.547       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0721     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 137.47222222222223
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -352       |
| time/                   |            |
|    fps                  | 109        |
|    iterations           | 90         |
|    time_elapsed         | 843        |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.01987376 |
|    clip_fraction        | 0.412      |
|    clip_range           | 0.15       |
|    entropy_loss         | -7.85      |
|    explained_variance   | 0.571      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.655      |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0767    |
|    value_loss           | 2.29       |
---------

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 141.5221088435374
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -343        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 98          |
|    time_elapsed         | 917         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.018695585 |
|    clip_fraction        | 0.388       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.65       |
|    explained_variance   | 0.573       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.736       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0774     |
|    value_loss           | 2.33

In [15]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 10.5
All assignments history: [17, 19, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -364     |
| time/              |          |
|    fps             | 134      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -330.0
Standard deviation of reward: 0.0
Average successful assignments: 20.875
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 2           |
|    time_elapsed         | 16          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008264987 |
|    clip_fraction        | 0.0526      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.192      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.68        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 16.6        |

-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 47.675
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 10          |
|    time_elapsed         | 91          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020130932 |
|    clip_fraction        | 0.45        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0184      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.454       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0656     |
|    value_loss           | 4.01        |

-------- Rollout Summary --------
Total mean reward: -258.0
Standard deviation of reward: 0.0
Average successful assignments: 59.0787037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 104         |
|    iterations           | 18          |
|    time_elapsed         | 176         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.025149774 |
|    clip_fraction        | 0.542       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.176       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0726      |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0648     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: -260.0
Standard deviation of reward: 0.0
Average successful assignments: 67.87179487179488
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 104         |
|    iterations           | 26          |
|    time_elapsed         | 254         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.022216227 |
|    clip_fraction        | 0.432       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.343       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.284      |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0639     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 72.4656862745098
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 34          |
|    time_elapsed         | 326         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.018609144 |
|    clip_fraction        | 0.366       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.411       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0864      |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.062      |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: -60.0
Standard deviation of reward: 0.0
Average successful assignments: 81.90674603174604
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -371       |
| time/                   |            |
|    fps                  | 106        |
|    iterations           | 42         |
|    time_elapsed         | 402        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.01697775 |
|    clip_fraction        | 0.325      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.484      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.00621    |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.06      |
|    value_loss           | 2.43       |
---------

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 93.04
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 50          |
|    time_elapsed         | 477         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.016201623 |
|    clip_fraction        | 0.298       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.503       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.04        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0586     |
|    value_loss           | 2.46        |
--

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 105.45258620689656
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 58          |
|    time_elapsed         | 549         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.017006602 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.538       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.924       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0632     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 114.85984848484848
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -367       |
| time/                   |            |
|    fps                  | 109        |
|    iterations           | 66         |
|    time_elapsed         | 617        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.01713822 |
|    clip_fraction        | 0.327      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.18      |
|    explained_variance   | 0.533      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.181      |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.063     |
|    value_loss           | 2.22       |
---------

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 122.16216216216216
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 74          |
|    time_elapsed         | 685         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.019201692 |
|    clip_fraction        | 0.394       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.13       |
|    explained_variance   | 0.499       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.562       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0721     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 128.8760162601626
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 82          |
|    time_elapsed         | 753         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.018908547 |
|    clip_fraction        | 0.398       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.03       |
|    explained_variance   | 0.571       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.197       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0731     |
|    value_loss           | 2.23

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 134.51018518518518
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -352        |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 90          |
|    time_elapsed         | 822         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018251013 |
|    clip_fraction        | 0.368       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.89       |
|    explained_variance   | 0.638       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.468       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0716     |
|    value_loss           | 2.0

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 139.18367346938774
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -343        |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 98          |
|    time_elapsed         | 890         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.017613297 |
|    clip_fraction        | 0.349       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.73       |
|    explained_variance   | 0.558       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.00223     |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0721     |
|    value_loss           | 2.3

In [16]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -378.0
Standard deviation of reward: 0.0
Average successful assignments: 11.333333333333334
All assignments history: [8, 18, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -374     |
| time/              |          |
|    fps             | 141      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -302.0
Standard deviation of reward: 0.0
Average successful assignments: 27.416666666666668
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -371       |
| time/                   |            |
|    fps                  | 119        |
|    iterations           | 2          |
|    time_elapsed         | 17         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00902289 |
|    clip_fraction        | 0.0684     |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.29      |
|    explained_variance   | -0.142     |
|    learning_rate        | 0.00018    |
|    loss                 | 3.29       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0528    |
|    value_loss           | 18.4       |
-------

-------- Rollout Summary --------
Total mean reward: -380.0
Standard deviation of reward: 0.0
Average successful assignments: 25.55
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -374       |
| time/                   |            |
|    fps                  | 115        |
|    iterations           | 10         |
|    time_elapsed         | 88         |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01872588 |
|    clip_fraction        | 0.387      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.0104     |
|    learning_rate        | 0.00018    |
|    loss                 | 0.5        |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0585    |
|    value_loss           | 4.78       |
--------------------

-------- Rollout Summary --------
Total mean reward: -230.0
Standard deviation of reward: 0.0
Average successful assignments: 41.97222222222222
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -374       |
| time/                   |            |
|    fps                  | 117        |
|    iterations           | 18         |
|    time_elapsed         | 156        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.02626752 |
|    clip_fraction        | 0.577      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.17       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.425      |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0656    |
|    value_loss           | 2.86       |
--------

-------- Rollout Summary --------
Total mean reward: -160.0
Standard deviation of reward: 0.0
Average successful assignments: 51.35897435897436
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 118         |
|    iterations           | 26          |
|    time_elapsed         | 223         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.024295423 |
|    clip_fraction        | 0.501       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.298       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0634     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -182.0
Standard deviation of reward: 0.0
Average successful assignments: 64.23529411764706
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 34          |
|    time_elapsed         | 292         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.021563508 |
|    clip_fraction        | 0.413       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.362       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0623     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 79.71626984126983
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -372       |
| time/                   |            |
|    fps                  | 119        |
|    iterations           | 42         |
|    time_elapsed         | 359        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.01648438 |
|    clip_fraction        | 0.287      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.24      |
|    explained_variance   | 0.415      |
|    learning_rate        | 0.00018    |
|    loss                 | 2.9        |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0562    |
|    value_loss           | 2.7        |
----------

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 91.77166666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 50          |
|    time_elapsed         | 427         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.017922355 |
|    clip_fraction        | 0.342       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.472       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.214       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0614     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 101.5603448275862
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -368       |
| time/                   |            |
|    fps                  | 119        |
|    iterations           | 58         |
|    time_elapsed         | 494        |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.01546696 |
|    clip_fraction        | 0.288      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.21      |
|    explained_variance   | 0.461      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.145      |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.0599    |
|    value_loss           | 2.6        |
---------

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 108.93181818181819
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 66          |
|    time_elapsed         | 556         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.017889418 |
|    clip_fraction        | 0.354       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.449       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0631     |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0657     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -26.0
Standard deviation of reward: 0.0
Average successful assignments: 114.76576576576576
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -363        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 74          |
|    time_elapsed         | 616         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.016254766 |
|    clip_fraction        | 0.302       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.12       |
|    explained_variance   | 0.436       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0347     |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0598     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -24.0
Standard deviation of reward: 0.0
Average successful assignments: 119.33028455284553
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 125         |
|    iterations           | 82          |
|    time_elapsed         | 671         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.016701369 |
|    clip_fraction        | 0.316       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.03       |
|    explained_variance   | 0.471       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0134      |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0637     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 123.66203703703704
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -352        |
| time/                   |             |
|    fps                  | 128         |
|    iterations           | 90          |
|    time_elapsed         | 716         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018402923 |
|    clip_fraction        | 0.363       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.92       |
|    explained_variance   | 0.417       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.588       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0727     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 128.43707482993196
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -345        |
| time/                   |             |
|    fps                  | 132         |
|    iterations           | 98          |
|    time_elapsed         | 758         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.017248146 |
|    clip_fraction        | 0.359       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.74       |
|    explained_variance   | 0.446       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0743     |
|    value_loss           | 2.3

In [17]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -368.0
Standard deviation of reward: 0.0
Average successful assignments: 15.25
All assignments history: [13, 10, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -377     |
| time/              |          |
|    fps             | 331      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -344.0
Standard deviation of reward: 0.0
Average successful assignments: 20.916666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 307         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008336947 |
|    clip_fraction        | 0.0547      |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.29       |
|    explained_variance   | -0.212      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.23        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0501     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -96.0
Standard deviation of reward: 0.0
Average successful assignments: 48.391666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 291         |
|    iterations           | 10          |
|    time_elapsed         | 35          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020186123 |
|    clip_fraction        | 0.441       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.00248     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0639     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -120.0
Standard deviation of reward: 0.0
Average successful assignments: 71.88425925925925
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 288         |
|    iterations           | 18          |
|    time_elapsed         | 63          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.025545478 |
|    clip_fraction        | 0.553       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.103       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0349      |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0614     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 83.17307692307692
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 286         |
|    iterations           | 26          |
|    time_elapsed         | 92          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.020560425 |
|    clip_fraction        | 0.392       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.27        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.931       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0597     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -114.0
Standard deviation of reward: 0.0
Average successful assignments: 93.8970588235294
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 285         |
|    iterations           | 34          |
|    time_elapsed         | 122         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.016427156 |
|    clip_fraction        | 0.309       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.341       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.105       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0549     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: -152.0
Standard deviation of reward: 0.0
Average successful assignments: 99.87103174603175
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 285         |
|    iterations           | 42          |
|    time_elapsed         | 150         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.017129952 |
|    clip_fraction        | 0.301       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.335       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0604     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 107.845
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 285         |
|    iterations           | 50          |
|    time_elapsed         | 179         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.015433932 |
|    clip_fraction        | 0.286       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.432       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.173      |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0598     |
|    value_loss           | 2.45        |


-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 117.39367816091954
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 284         |
|    iterations           | 58          |
|    time_elapsed         | 209         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.015307402 |
|    clip_fraction        | 0.298       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.498       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.165      |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0621     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 125.31186868686869
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 284         |
|    iterations           | 66          |
|    time_elapsed         | 237         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.018554619 |
|    clip_fraction        | 0.344       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.19       |
|    explained_variance   | 0.52        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0359      |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0656     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 131.39414414414415
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 284         |
|    iterations           | 74          |
|    time_elapsed         | 266         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.018939879 |
|    clip_fraction        | 0.377       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.17       |
|    explained_variance   | 0.569       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.27       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0686     |
|    value_loss           | 2.0

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 136.85772357723576
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -363       |
| time/                   |            |
|    fps                  | 284        |
|    iterations           | 82         |
|    time_elapsed         | 295        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01878965 |
|    clip_fraction        | 0.386      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.11      |
|    explained_variance   | 0.553      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.953      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0712    |
|    value_loss           | 2.22       |
---------

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 141.34259259259258
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 284         |
|    iterations           | 90          |
|    time_elapsed         | 324         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.020849649 |
|    clip_fraction        | 0.438       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.97       |
|    explained_variance   | 0.56        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.776       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0787     |
|    value_loss           | 2.1

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 145.29931972789115
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -351        |
| time/                   |             |
|    fps                  | 284         |
|    iterations           | 98          |
|    time_elapsed         | 352         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.019492995 |
|    clip_fraction        | 0.413       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.8        |
|    explained_variance   | 0.541       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.314       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0788     |
|    value_loss           | 2.0

In [19]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.1.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -358.0
Standard deviation of reward: 0.0
Average successful assignments: 20.333333333333332
All assignments history: [13, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -366     |
| time/              |          |
|    fps             | 345      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -352.0
Standard deviation of reward: 0.0
Average successful assignments: 21.5
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -367       |
| time/                   |            |
|    fps                  | 318        |
|    iterations           | 2          |
|    time_elapsed         | 6          |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00853461 |
|    clip_fraction        | 0.0613     |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.29      |
|    explained_variance   | -0.198     |
|    learning_rate        | 0.00018    |
|    loss                 | 1.74       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0503    |
|    value_loss           | 16.9       |
---------------------

-------- Rollout Summary --------
Total mean reward: -254.0
Standard deviation of reward: 0.0
Average successful assignments: 21.658333333333335
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 299         |
|    iterations           | 10          |
|    time_elapsed         | 34          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.020306472 |
|    clip_fraction        | 0.457       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.000932    |
|    learning_rate        | 0.00018     |
|    loss                 | 0.185       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.064      |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: -374.0
Standard deviation of reward: 0.0
Average successful assignments: 32.47222222222222
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 299         |
|    iterations           | 18          |
|    time_elapsed         | 61          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.024701755 |
|    clip_fraction        | 0.545       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.098       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.572       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0587     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -126.0
Standard deviation of reward: 0.0
Average successful assignments: 46.39102564102564
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 302         |
|    iterations           | 26          |
|    time_elapsed         | 88          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.019882629 |
|    clip_fraction        | 0.402       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.27       |
|    explained_variance   | 0.32        |
|    learning_rate        | 0.00018     |
|    loss                 | 3.42        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0577     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -48.0
Standard deviation of reward: 0.0
Average successful assignments: 70.48039215686275
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -372       |
| time/                   |            |
|    fps                  | 304        |
|    iterations           | 34         |
|    time_elapsed         | 114        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.01812326 |
|    clip_fraction        | 0.339      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.26      |
|    explained_variance   | 0.361      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.244      |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0589    |
|    value_loss           | 2.78       |
---------

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 85.1686507936508
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 305         |
|    iterations           | 42          |
|    time_elapsed         | 140         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.015452693 |
|    clip_fraction        | 0.287       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.25       |
|    explained_variance   | 0.444       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.72        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0604     |
|    value_loss           | 2.67 

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 97.71666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 305         |
|    iterations           | 50          |
|    time_elapsed         | 167         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.013444095 |
|    clip_fraction        | 0.246       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.23       |
|    explained_variance   | 0.398       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.386       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0568     |
|    value_loss           | 2.76

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 110.34770114942529
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 304         |
|    iterations           | 58          |
|    time_elapsed         | 194         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.017038386 |
|    clip_fraction        | 0.314       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.458       |
|    learning_rate        | 0.00018     |
|    loss                 | 3.54        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0603     |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 119.90025252525253
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 304         |
|    iterations           | 66          |
|    time_elapsed         | 221         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.017152017 |
|    clip_fraction        | 0.307       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.18       |
|    explained_variance   | 0.438       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.521       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0612     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 127.38738738738739
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 304         |
|    iterations           | 74          |
|    time_elapsed         | 248         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.019900724 |
|    clip_fraction        | 0.412       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.13       |
|    explained_variance   | 0.448       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.539       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0713     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 133.6371951219512
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 304         |
|    iterations           | 82          |
|    time_elapsed         | 275         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.019632448 |
|    clip_fraction        | 0.384       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.04       |
|    explained_variance   | 0.461       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.862       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0674     |
|    value_loss           | 2.34

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 138.85185185185185
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -353        |
| time/                   |             |
|    fps                  | 304         |
|    iterations           | 90          |
|    time_elapsed         | 302         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.021134604 |
|    clip_fraction        | 0.409       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.93       |
|    explained_variance   | 0.403       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.045       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0706     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 143.5374149659864
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -343        |
| time/                   |             |
|    fps                  | 305         |
|    iterations           | 98          |
|    time_elapsed         | 328         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.019311842 |
|    clip_fraction        | 0.405       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.7        |
|    explained_variance   | 0.325       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0738     |
|    value_loss           | 2.65