In [2]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
vehicles_df = pd.read_csv('VehicleTrainingDataset_Noisy_0.01.csv')
tasks_df = pd.read_csv('RandomTasks400.csv')

# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.action_space = spaces.Discrete(len(vehicles))
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(tasks.shape[1],), dtype=np.float32)
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0  # Reset successful assignments
        #print(f"Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        # Detailed printout of state, action, reward
        
        #lines below can be uncommented for a more detailed output
        #print(f"Task: {task.to_dict()}")
        #print(f"Chosen Vehicle: {vehicle.to_dict()}")
        #print(f"Action: {action}, Reward: {reward}, Done: {done}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0

    def render(self, mode='human'):
        pass

    def close(self):
        pass


# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1
        
        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")

# Load new task dataset for testing
new_tasks_df = pd.read_csv('RandomTasks400Test.csv')
new_tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate'
}, inplace=True)
test_env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, new_tasks_df), n_envs=1)

# Evaluate the model on the new test environment
mean_reward, std_reward = evaluate_policy(model, test_env, n_eval_episodes=10)
# Extract the successful assignments history from the test environment
successful_assignments = test_env.envs[0].env.get_average_success()

print("---- Testing Summary ----")
print(f"Mean Reward: {mean_reward}")
print(f"Standard Deviation of Reward: {std_reward}")
print(f"Average Successful Assignments in Testing: {successful_assignments}")



Using cpu device
-------- Rollout Summary --------
Total mean reward: -366.0
Standard deviation of reward: 0.0
Average successful assignments: 16.666666666666668
All assignments history: [15, 15, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -370     |
| time/              |          |
|    fps             | 307      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -324.0
Standard deviation of reward: 0.0
Average successful assignments: 25.291666666666668
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -372       |
| time/                   |            |
|    fps                  | 281        |
|    iterations           | 2          |
|    time_elapsed         | 7          |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00803951 |
|    clip_fraction        | 0.0563     |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.29      |
|    explained_variance   | -0.277     |
|    learning_rate        | 0.00018    |
|    loss                 | 2.33       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0498    |
|    value_loss           | 16.1       |
-------

-------- Rollout Summary --------
Total mean reward: -298.0
Standard deviation of reward: 0.0
Average successful assignments: 36.55
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 263         |
|    iterations           | 10          |
|    time_elapsed         | 38          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.018954322 |
|    clip_fraction        | 0.402       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.000534    |
|    learning_rate        | 0.00018     |
|    loss                 | 0.454       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0629     |
|    value_loss           | 4.31        |


-------- Rollout Summary --------
Total mean reward: -248.0
Standard deviation of reward: 0.0
Average successful assignments: 36.26851851851852
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 18          |
|    time_elapsed         | 77          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.029300708 |
|    clip_fraction        | 0.612       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.28       |
|    explained_variance   | 0.0809      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.91        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0638     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -68.0
Standard deviation of reward: 0.0
Average successful assignments: 55.81730769230769
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -372       |
| time/                   |            |
|    fps                  | 230        |
|    iterations           | 26         |
|    time_elapsed         | 115        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.02366012 |
|    clip_fraction        | 0.491      |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.27      |
|    explained_variance   | 0.245      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.83       |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0649    |
|    value_loss           | 3.23       |
---------

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 78.45098039215686
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 225         |
|    iterations           | 34          |
|    time_elapsed         | 154         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.017593559 |
|    clip_fraction        | 0.328       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.26       |
|    explained_variance   | 0.312       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.302       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0583     |
|    value_loss           | 2.7  

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 98.2936507936508
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -371        |
| time/                   |             |
|    fps                  | 218         |
|    iterations           | 42          |
|    time_elapsed         | 197         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.014833454 |
|    clip_fraction        | 0.27        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.24       |
|    explained_variance   | 0.384       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.521       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0581     |
|    value_loss           | 2.63 

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 112.17833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 210         |
|    iterations           | 50          |
|    time_elapsed         | 243         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.013836836 |
|    clip_fraction        | 0.242       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.22       |
|    explained_variance   | 0.455       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.35       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0539     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 122.44827586206897
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 205         |
|    iterations           | 58          |
|    time_elapsed         | 288         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.015876496 |
|    clip_fraction        | 0.269       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.21       |
|    explained_variance   | 0.446       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.39        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0575     |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 130.60858585858585
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -367       |
| time/                   |            |
|    fps                  | 203        |
|    iterations           | 66         |
|    time_elapsed         | 332        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.01632578 |
|    clip_fraction        | 0.29       |
|    clip_range           | 0.15       |
|    entropy_loss         | -8.19      |
|    explained_variance   | 0.493      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.08       |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0613    |
|    value_loss           | 2.45       |
---------

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 139.05405405405406
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 201         |
|    iterations           | 74          |
|    time_elapsed         | 376         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.016125081 |
|    clip_fraction        | 0.32        |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.15       |
|    explained_variance   | 0.493       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0684     |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0648     |
|    value_loss           | 2.3

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 145.9959349593496
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 200         |
|    iterations           | 82          |
|    time_elapsed         | 419         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.017059248 |
|    clip_fraction        | 0.348       |
|    clip_range           | 0.15        |
|    entropy_loss         | -8.08       |
|    explained_variance   | 0.5         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.787       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0683     |
|    value_loss           | 2.52

-------- Rollout Summary --------
Total mean reward: 92.0
Standard deviation of reward: 0.0
Average successful assignments: 151.72222222222223
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -354        |
| time/                   |             |
|    fps                  | 199         |
|    iterations           | 90          |
|    time_elapsed         | 462         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.018000253 |
|    clip_fraction        | 0.371       |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.96       |
|    explained_variance   | 0.468       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.42        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0726     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 157.11479591836735
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -344        |
| time/                   |             |
|    fps                  | 197         |
|    iterations           | 98          |
|    time_elapsed         | 507         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.017511066 |
|    clip_fraction        | 0.36        |
|    clip_range           | 0.15        |
|    entropy_loss         | -7.68       |
|    explained_variance   | 0.439       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.22        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0727     |
|    value_loss           | 2.



---- Testing Summary ----
Mean Reward: 86.0
Standard Deviation of Reward: 0.0
Average Successful Assignments in Testing: 243.0
