In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def train_ridge_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    ridge_model = Ridge()
    ridge_model.fit(X_train_scaled, y_train)
    return ridge_model, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.01.csv')
ridge_model, scaler = train_ridge_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.01.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = ridge_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']  # This would be prior to overwriting with predictions if you run this block again

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


MAE: 0.09766677979988178
RMSE: 0.12209370048784808
R-squared: 0.9998991762339906
RAE: 0.010200341696916998


In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 10.833333333333334
All assignments history: [23, 17, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -360     |
| time/              |          |
|    fps             | 63       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -236.0
Standard deviation of reward: 0.0
Average successful assignments: 40.75
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 63          |
|    iterations           | 2           |
|    time_elapsed         | 32          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007865168 |
|    clip_fraction        | 0.0867      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.0923     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.1         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0407     |
|    value_loss           | 13.7        |


-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 130.5
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 10          |
|    time_elapsed         | 179         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009345334 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.00201     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.39        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0403     |
|    value_loss           | 3.84        |
--

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 153.2037037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 332         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009713162 |
|    clip_fraction        | 0.225       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.67       |
|    explained_variance   | 0.215       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.233       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0495     |
|    value_loss           | 2.84

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 164.16346153846155
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -344        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 26          |
|    time_elapsed         | 492         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008712022 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.37       |
|    explained_variance   | 0.3         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.51        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0471     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 173.25735294117646
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -322        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 34          |
|    time_elapsed         | 645         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008298583 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.86       |
|    explained_variance   | 0.279       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.53        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0471     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: 114.0
Standard deviation of reward: 0.0
Average successful assignments: 183.61904761904762
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -296       |
| time/                   |            |
|    fps                  | 53         |
|    iterations           | 42         |
|    time_elapsed         | 807        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00939177 |
|    clip_fraction        | 0.187      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.38      |
|    explained_variance   | 0.312      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.69       |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0494    |
|    value_loss           | 3.08       |
--------

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 192.82166666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -268        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 50          |
|    time_elapsed         | 965         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007514913 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.9        |
|    explained_variance   | 0.563       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0413     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 201.26580459770116
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -221        |
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 58          |
|    time_elapsed         | 1137        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007062787 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.54       |
|    explained_variance   | 0.464       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.25        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0379     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 208.28156565656565
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -171         |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 66           |
|    time_elapsed         | 1306         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0072328886 |
|    clip_fraction        | 0.149        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.22        |
|    explained_variance   | 0.405        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.09         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0394      |
|    value_

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 214.11373873873873
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -122         |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 74           |
|    time_elapsed         | 1480         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0060248515 |
|    clip_fraction        | 0.107        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.99        |
|    explained_variance   | 0.439        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.27         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0328      |
|    value_

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 218.91260162601625
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -78.4       |
| time/                   |             |
|    fps                  | 50          |
|    iterations           | 82          |
|    time_elapsed         | 1652        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008000887 |
|    clip_fraction        | 0.136       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3          |
|    explained_variance   | 0.384       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.46        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0303     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 223.07222222222222
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -45.1        |
| time/                   |              |
|    fps                  | 50           |
|    iterations           | 90           |
|    time_elapsed         | 1824         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0057227863 |
|    clip_fraction        | 0.0958       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.86        |
|    explained_variance   | 0.439        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.11         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0327      |
|    value_

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 226.6326530612245
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -19.8        |
| time/                   |              |
|    fps                  | 50           |
|    iterations           | 98           |
|    time_elapsed         | 1990         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0070253583 |
|    clip_fraction        | 0.135        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.88        |
|    explained_variance   | 0.433        |
|    learning_rate        | 0.00018      |
|    loss                 | 1            |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0345      |
|    value_l

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 9.583333333333334
All assignments history: [14, 11, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -375     |
| time/              |          |
|    fps             | 57       |
|    iterations      | 1        |
|    time_elapsed    | 17       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -362.0
Standard deviation of reward: 0.0
Average successful assignments: 13.875
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -374         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 2            |
|    time_elapsed         | 37           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0072248587 |
|    clip_fraction        | 0.0611       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | -0.156       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.7          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0381      |
|    value_loss       

-------- Rollout Summary --------
Total mean reward: -36.0
Standard deviation of reward: 0.0
Average successful assignments: 91.14166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -363        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 10          |
|    time_elapsed         | 191         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010722457 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.00588     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.236       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0449     |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 126.89814814814815
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -358       |
| time/                   |            |
|    fps                  | 53         |
|    iterations           | 18         |
|    time_elapsed         | 346        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.00933335 |
|    clip_fraction        | 0.168      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.68      |
|    explained_variance   | 0.24       |
|    learning_rate        | 0.00018    |
|    loss                 | 1.15       |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0433    |
|    value_loss           | 2.88       |
---------

-------- Rollout Summary --------
Total mean reward: 96.0
Standard deviation of reward: 0.0
Average successful assignments: 150.1346153846154
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -347        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 26          |
|    time_elapsed         | 497         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009114205 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.47       |
|    explained_variance   | 0.406       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.484       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.046      |
|    value_loss           | 2.69

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 167.65686274509804
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -327        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 34          |
|    time_elapsed         | 643         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008786926 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.06       |
|    explained_variance   | 0.627       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0479     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 180.85119047619048
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -301        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 42          |
|    time_elapsed         | 796         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008965464 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.4        |
|    explained_variance   | 0.597       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.35        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0497     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 192.50333333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -271         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 50           |
|    time_elapsed         | 947          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0071968553 |
|    clip_fraction        | 0.15         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.79        |
|    explained_variance   | 0.542        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.63         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0451      |
|    value_

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 201.86637931034483
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -221         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 58           |
|    time_elapsed         | 1088         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0052702087 |
|    clip_fraction        | 0.0954       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.28        |
|    explained_variance   | 0.457        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.35         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.035       |
|    value_

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 209.4747474747475
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -168         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 66           |
|    time_elapsed         | 1229         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0056812908 |
|    clip_fraction        | 0.0847       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.05        |
|    explained_variance   | 0.485        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.29         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0269      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 168.0
Standard deviation of reward: 0.0
Average successful assignments: 215.9605855855856
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -115         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 74           |
|    time_elapsed         | 1369         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0055679027 |
|    clip_fraction        | 0.103        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.85        |
|    explained_variance   | 0.424        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.22         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0336      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 221.3231707317073
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -67.7      |
| time/                   |            |
|    fps                  | 55         |
|    iterations           | 82         |
|    time_elapsed         | 1509       |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00561399 |
|    clip_fraction        | 0.122      |
|    clip_range           | 0.15       |
|    entropy_loss         | -2.77      |
|    explained_variance   | 0.48       |
|    learning_rate        | 0.00018    |
|    loss                 | 1.5        |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0359    |
|    value_loss           | 3.19       |
---------

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 225.8935185185185
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -28.1       |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 90          |
|    time_elapsed         | 1653        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005300938 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.56       |
|    explained_variance   | 0.512       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.777       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0321     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 168.0
Standard deviation of reward: 0.0
Average successful assignments: 229.76615646258503
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -1.5         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 98           |
|    time_elapsed         | 1794         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0076194964 |
|    clip_fraction        | 0.135        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.62        |
|    explained_variance   | 0.554        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.24         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0345      |
|    value_

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -290.0
Standard deviation of reward: 0.0
Average successful assignments: 48.25
All assignments history: [18, 11, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -371     |
| time/              |          |
|    fps             | 62       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -360.0
Standard deviation of reward: 0.0
Average successful assignments: 33.708333333333336
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 2           |
|    time_elapsed         | 34          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007381222 |
|    clip_fraction        | 0.0728      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.217      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.21        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0393     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -54.0
Standard deviation of reward: 0.0
Average successful assignments: 89.23333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 10          |
|    time_elapsed         | 173         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010339567 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.00211     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.59        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.043      |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 133.97222222222223
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -357         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 18           |
|    time_elapsed         | 311          |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0095484415 |
|    clip_fraction        | 0.148        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.69        |
|    explained_variance   | 0.19         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.281        |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0382      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 100.0
Standard deviation of reward: 0.0
Average successful assignments: 154.4551282051282
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -345       |
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 26         |
|    time_elapsed         | 453        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00915445 |
|    clip_fraction        | 0.162      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.44      |
|    explained_variance   | 0.228      |
|    learning_rate        | 0.00018    |
|    loss                 | 2.5        |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0432    |
|    value_loss           | 3.22       |
---------

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 169.6421568627451
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -325        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 34          |
|    time_elapsed         | 594         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008524577 |
|    clip_fraction        | 0.18        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.88       |
|    explained_variance   | 0.295       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.864       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0477     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 181.38095238095238
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -296       |
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 42         |
|    time_elapsed         | 736        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00920392 |
|    clip_fraction        | 0.161      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.15      |
|    explained_variance   | 0.46       |
|    learning_rate        | 0.00018    |
|    loss                 | 1.03       |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0485    |
|    value_loss           | 2.94       |
--------

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 191.85
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -265         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 50           |
|    time_elapsed         | 878          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0056209196 |
|    clip_fraction        | 0.0909       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.6         |
|    explained_variance   | 0.342        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.01         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0378      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 200.617816091954
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -214        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 58          |
|    time_elapsed         | 1015        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006418921 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.11       |
|    explained_variance   | 0.358       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.44        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0366     |
|    value_loss           | 3.3 

-------- Rollout Summary --------
Total mean reward: 148.0
Standard deviation of reward: 0.0
Average successful assignments: 207.85353535353536
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -160         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 66           |
|    time_elapsed         | 1154         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0055451533 |
|    clip_fraction        | 0.0843       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.87        |
|    explained_variance   | 0.433        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.69         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.03        |
|    value_

-------- Rollout Summary --------
Total mean reward: 152.0
Standard deviation of reward: 0.0
Average successful assignments: 213.80743243243242
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -107        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 74          |
|    time_elapsed         | 1306        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.004815311 |
|    clip_fraction        | 0.0799      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.87       |
|    explained_variance   | 0.543       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.44        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.03       |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 218.78252032520325
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -61.1       |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 82          |
|    time_elapsed         | 1456        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006024363 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.8        |
|    explained_variance   | 0.53        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.26        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0315     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 223.17777777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -27.7       |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 90          |
|    time_elapsed         | 1608        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005415203 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.82       |
|    explained_variance   | 0.555       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.51        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0342     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 226.98044217687075
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -6.28       |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 98          |
|    time_elapsed         | 1760        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008548133 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.91       |
|    explained_variance   | 0.505       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.29        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.037      |
|    value_loss           | 3.

In [15]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -346.0
Standard deviation of reward: 0.0
Average successful assignments: 24.666666666666668
All assignments history: [12, 14, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -374     |
| time/              |          |
|    fps             | 62       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 16.708333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 2           |
|    time_elapsed         | 35          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007665269 |
|    clip_fraction        | 0.0678      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.23       |
|    learning_rate        | 0.00018     |
|    loss                 | 3.24        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0394     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -28.0
Standard deviation of reward: 0.0
Average successful assignments: 85.71666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 10          |
|    time_elapsed         | 187         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010925803 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | -0.000376   |
|    learning_rate        | 0.00018     |
|    loss                 | 0.578       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0463     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 124.69444444444444
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 18          |
|    time_elapsed         | 338         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010402429 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.157       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.173       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0459     |
|    value_loss           | 2.74

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 141.46794871794873
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -346         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 26           |
|    time_elapsed         | 486          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0078095063 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.44        |
|    explained_variance   | 0.209        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.862        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0414      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 153.08823529411765
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -326         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 34           |
|    time_elapsed         | 632          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0076052574 |
|    clip_fraction        | 0.139        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.86        |
|    explained_variance   | 0.318        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.44         |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0446      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 162.81150793650792
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -297        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 42          |
|    time_elapsed         | 783         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.006483592 |
|    clip_fraction        | 0.0859      |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.21       |
|    explained_variance   | 0.592       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0358     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 172.43
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -270         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 50           |
|    time_elapsed         | 935          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0063365055 |
|    clip_fraction        | 0.114        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.82        |
|    explained_variance   | 0.65         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.03         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0393      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 182.0272988505747
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -223       |
| time/                   |            |
|    fps                  | 54         |
|    iterations           | 58         |
|    time_elapsed         | 1086       |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.00625679 |
|    clip_fraction        | 0.129      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.47      |
|    explained_variance   | 0.626      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.73       |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.0411    |
|    value_loss           | 3.37       |
---------

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 190.35858585858585
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 66          |
|    time_elapsed         | 1240        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006935148 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.42       |
|    explained_variance   | 0.652       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.956       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0375     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 197.45495495495496
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -128        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 74          |
|    time_elapsed         | 1384        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.007228287 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.3        |
|    explained_variance   | 0.454       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0393     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 203.5111788617886
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -89.5       |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 82          |
|    time_elapsed         | 1525        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.007019949 |
|    clip_fraction        | 0.141       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.23       |
|    explained_variance   | 0.421       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.1         |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0391     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 208.63055555555556
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -62.1        |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 90           |
|    time_elapsed         | 1677         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0072122207 |
|    clip_fraction        | 0.139        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.27        |
|    explained_variance   | 0.387        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.23         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0385      |
|    value_

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 212.87414965986395
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -46.1        |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 98           |
|    time_elapsed         | 1824         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0075895973 |
|    clip_fraction        | 0.137        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.2         |
|    explained_variance   | 0.419        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.27         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.038       |
|    value_

In [16]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 8.916666666666666
All assignments history: [13, 14, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -373     |
| time/              |          |
|    fps             | 60       |
|    iterations      | 1        |
|    time_elapsed    | 17       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -80.0
Standard deviation of reward: 0.0
Average successful assignments: 72.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -376        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 2           |
|    time_elapsed         | 36          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008233357 |
|    clip_fraction        | 0.0938      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.175      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.58        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0412     |
|    value_loss           | 17.4        |
--

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 130.86666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 10          |
|    time_elapsed         | 184         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010216125 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.00525     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.51        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0414     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 146.625
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 331         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009959218 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.64       |
|    explained_variance   | 0.17        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.567       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0432     |
|    value_loss           | 2.84        |


-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 154.40705128205127
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -343      |
| time/                   |           |
|    fps                  | 55        |
|    iterations           | 26        |
|    time_elapsed         | 478       |
|    total_timesteps      | 26624     |
| train/                  |           |
|    approx_kl            | 0.0084115 |
|    clip_fraction        | 0.161     |
|    clip_range           | 0.15      |
|    entropy_loss         | -5.35     |
|    explained_variance   | 0.233     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.489     |
|    n_updates            | 250       |
|    policy_gradient_loss | -0.0412   |
|    value_loss           | 2.95      |
-----------------------------

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 160.1813725490196
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -322        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 34          |
|    time_elapsed         | 615         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008472336 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.73       |
|    explained_variance   | 0.338       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.56        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0401     |
|    value_loss           | 3.4  

-------- Rollout Summary --------
Total mean reward: 98.0
Standard deviation of reward: 0.0
Average successful assignments: 168.98809523809524
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -295        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 42          |
|    time_elapsed         | 747         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007862419 |
|    clip_fraction        | 0.159       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.29       |
|    explained_variance   | 0.385       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0444     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 100.0
Standard deviation of reward: 0.0
Average successful assignments: 178.64666666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -269         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 50           |
|    time_elapsed         | 878          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0085256975 |
|    clip_fraction        | 0.167        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.94        |
|    explained_variance   | 0.53         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.716        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0418      |
|    value_

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 186.81034482758622
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -227         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 58           |
|    time_elapsed         | 1009         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0071220067 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.63        |
|    explained_variance   | 0.49         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.1          |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0376      |
|    value_

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 194.3901515151515
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -181         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 66           |
|    time_elapsed         | 1137         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0069650803 |
|    clip_fraction        | 0.113        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.35        |
|    explained_variance   | 0.536        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.27         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0343      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 201.22747747747746
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -137        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 74          |
|    time_elapsed         | 1253        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006183902 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.29       |
|    explained_variance   | 0.474       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0349     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 207.2510162601626
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -98.5        |
| time/                   |              |
|    fps                  | 61           |
|    iterations           | 82           |
|    time_elapsed         | 1366         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0067486856 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.13        |
|    explained_variance   | 0.398        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.38         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.035       |
|    value_l

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 212.51666666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -67.5        |
| time/                   |              |
|    fps                  | 62           |
|    iterations           | 90           |
|    time_elapsed         | 1477         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0060162274 |
|    clip_fraction        | 0.137        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.95        |
|    explained_variance   | 0.419        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.856        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0364      |
|    value_

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 217.13180272108843
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -43.9        |
| time/                   |              |
|    fps                  | 63           |
|    iterations           | 98           |
|    time_elapsed         | 1592         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0075196456 |
|    clip_fraction        | 0.125        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.86        |
|    explained_variance   | 0.392        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.18         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0351      |
|    value_

In [17]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -370.0
Standard deviation of reward: 0.0
Average successful assignments: 14.666666666666666
All assignments history: [16, 10, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -374     |
| time/              |          |
|    fps             | 75       |
|    iterations      | 1        |
|    time_elapsed    | 13       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -360.0
Standard deviation of reward: 0.0
Average successful assignments: 16.75
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007775809 |
|    clip_fraction        | 0.0881      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.3        |
|    learning_rate        | 0.00018     |
|    loss                 | 2.17        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0412     |
|    value_loss           | 16.3        |


-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 81.64166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 10          |
|    time_elapsed         | 138         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010708357 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | -0.00324    |
|    learning_rate        | 0.00018     |
|    loss                 | 1.08        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0408     |
|    value_loss           | 4.1

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 119.81944444444444
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -360       |
| time/                   |            |
|    fps                  | 72         |
|    iterations           | 18         |
|    time_elapsed         | 253        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.00961812 |
|    clip_fraction        | 0.184      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.69      |
|    explained_variance   | 0.185      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.17       |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0427    |
|    value_loss           | 2.74       |
---------

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 140.0096153846154
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -348        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 26          |
|    time_elapsed         | 366         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008857945 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.47       |
|    explained_variance   | 0.317       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.54        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0444     |
|    value_loss           | 3   

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 153.34558823529412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -329        |
| time/                   |             |
|    fps                  | 69          |
|    iterations           | 34          |
|    time_elapsed         | 498         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009122852 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.01       |
|    explained_variance   | 0.411       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.12        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0472     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 90.0
Standard deviation of reward: 0.0
Average successful assignments: 165.30555555555554
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -304        |
| time/                   |             |
|    fps                  | 70          |
|    iterations           | 42          |
|    time_elapsed         | 606         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007920412 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.4        |
|    explained_variance   | 0.558       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.777       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0429     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 176.62833333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -276       |
| time/                   |            |
|    fps                  | 70         |
|    iterations           | 50         |
|    time_elapsed         | 727        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.00843236 |
|    clip_fraction        | 0.14       |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.86      |
|    explained_variance   | 0.481      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.909      |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0406    |
|    value_loss           | 3.51       |
--------

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 186.93965517241378
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -229         |
| time/                   |              |
|    fps                  | 70           |
|    iterations           | 58           |
|    time_elapsed         | 838          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0056863762 |
|    clip_fraction        | 0.0956       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.5         |
|    explained_variance   | 0.497        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0324      |
|    value_

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 195.8409090909091
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -179         |
| time/                   |              |
|    fps                  | 71           |
|    iterations           | 66           |
|    time_elapsed         | 943          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0054735793 |
|    clip_fraction        | 0.0882       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.29        |
|    explained_variance   | 0.589        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.29         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0319      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 203.30180180180182
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -129         |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 74           |
|    time_elapsed         | 1047         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0060331402 |
|    clip_fraction        | 0.113        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.06        |
|    explained_variance   | 0.517        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.55         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0338      |
|    value_

-------- Rollout Summary --------
Total mean reward: 178.0
Standard deviation of reward: 0.0
Average successful assignments: 209.89735772357724
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -85.1        |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 82           |
|    time_elapsed         | 1152         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0057814894 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.04        |
|    explained_variance   | 0.415        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.75         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0349      |
|    value_

-------- Rollout Summary --------
Total mean reward: 182.0
Standard deviation of reward: 0.0
Average successful assignments: 215.73611111111111
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -47.8        |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 90           |
|    time_elapsed         | 1263         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0060715307 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.91        |
|    explained_variance   | 0.523        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.29         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0358      |
|    value_

-------- Rollout Summary --------
Total mean reward: 184.0
Standard deviation of reward: 0.0
Average successful assignments: 220.72704081632654
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -21.4        |
| time/                   |              |
|    fps                  | 73           |
|    iterations           | 98           |
|    time_elapsed         | 1368         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0056274915 |
|    clip_fraction        | 0.125        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.86        |
|    explained_variance   | 0.538        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.48         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.037       |
|    value_

In [18]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 9.166666666666666
All assignments history: [16, 14, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -370     |
| time/              |          |
|    fps             | 84       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -370.0
Standard deviation of reward: 0.0
Average successful assignments: 11.916666666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -372         |
| time/                   |              |
|    fps                  | 82           |
|    iterations           | 2            |
|    time_elapsed         | 24           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0068474626 |
|    clip_fraction        | 0.0593       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | -0.217       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.65         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.039       |
|    value

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 114.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 10          |
|    time_elapsed         | 127         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009595756 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | -0.000342   |
|    learning_rate        | 0.00018     |
|    loss                 | 0.942       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0414     |
|    value_loss           | 3.93        |
-

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 139.26388888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 18          |
|    time_elapsed         | 228         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009518366 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.72       |
|    explained_variance   | 0.0564      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.81        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0441     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 154.83333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -349         |
| time/                   |              |
|    fps                  | 81           |
|    iterations           | 26           |
|    time_elapsed         | 326          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0062005967 |
|    clip_fraction        | 0.0933       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.51        |
|    explained_variance   | 0.301        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.314        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0372      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 164.46323529411765
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -332        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 34          |
|    time_elapsed         | 413         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008067076 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.03       |
|    explained_variance   | 0.361       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.62        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0448     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 175.390873015873
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -307        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 42          |
|    time_elapsed         | 485         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009650771 |
|    clip_fraction        | 0.193       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.44       |
|    explained_variance   | 0.287       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.35        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0522     |
|    value_loss           | 3.03

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 186.83666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -279        |
| time/                   |             |
|    fps                  | 96          |
|    iterations           | 50          |
|    time_elapsed         | 533         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007874096 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.86       |
|    explained_variance   | 0.235       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.74        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0438     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 196.79741379310346
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -231         |
| time/                   |              |
|    fps                  | 102          |
|    iterations           | 58           |
|    time_elapsed         | 576          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0054025576 |
|    clip_fraction        | 0.1          |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.36        |
|    explained_variance   | 0.378        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.73         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.036       |
|    value_

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 205.2348484848485
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 66          |
|    time_elapsed         | 611         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.004979445 |
|    clip_fraction        | 0.0927      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.05       |
|    explained_variance   | 0.48        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0312     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 176.0
Standard deviation of reward: 0.0
Average successful assignments: 212.43355855855856
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -124        |
| time/                   |             |
|    fps                  | 117         |
|    iterations           | 74          |
|    time_elapsed         | 646         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.004991723 |
|    clip_fraction        | 0.0833      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.84       |
|    explained_variance   | 0.468       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.58        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.029      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 178.0
Standard deviation of reward: 0.0
Average successful assignments: 218.53252032520325
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -74.3       |
| time/                   |             |
|    fps                  | 123         |
|    iterations           | 82          |
|    time_elapsed         | 680         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.004657779 |
|    clip_fraction        | 0.0715      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.77       |
|    explained_variance   | 0.601       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.19        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0288     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 180.0
Standard deviation of reward: 0.0
Average successful assignments: 223.71666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -34.9       |
| time/                   |             |
|    fps                  | 128         |
|    iterations           | 90          |
|    time_elapsed         | 715         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005556278 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.73       |
|    explained_variance   | 0.667       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.71        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0302     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 180.0
Standard deviation of reward: 0.0
Average successful assignments: 228.21428571428572
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -6.86       |
| time/                   |             |
|    fps                  | 133         |
|    iterations           | 98          |
|    time_elapsed         | 750         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006439415 |
|    clip_fraction        | 0.12        |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.68       |
|    explained_variance   | 0.619       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.035      |
|    value_loss           | 3.

In [19]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -386.0
Standard deviation of reward: 0.0
Average successful assignments: 7.583333333333333
All assignments history: [8, 13, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -379     |
| time/              |          |
|    fps             | 258      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -336.0
Standard deviation of reward: 0.0
Average successful assignments: 18.291666666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -376         |
| time/                   |              |
|    fps                  | 248          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0065355394 |
|    clip_fraction        | 0.0546       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | -0.0634      |
|    learning_rate        | 0.00018      |
|    loss                 | 4.08         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0363      |
|    value

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 124.83333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 10          |
|    time_elapsed         | 43          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.008499581 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.0021      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.78        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0383     |
|    value_loss           | 4.6

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 146.02777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 240         |
|    iterations           | 18          |
|    time_elapsed         | 76          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010954897 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.118       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.36        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0416     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 161.5897435897436
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -345        |
| time/                   |             |
|    fps                  | 240         |
|    iterations           | 26          |
|    time_elapsed         | 110         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007585579 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.41       |
|    explained_variance   | 0.384       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.396       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0419     |
|    value_loss           | 2.92

-------- Rollout Summary --------
Total mean reward: 88.0
Standard deviation of reward: 0.0
Average successful assignments: 172.9558823529412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -325        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 34          |
|    time_elapsed         | 145         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008307281 |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.9        |
|    explained_variance   | 0.588       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.875       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0441     |
|    value_loss           | 2.92

-------- Rollout Summary --------
Total mean reward: 86.0
Standard deviation of reward: 0.0
Average successful assignments: 182.06349206349208
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -298         |
| time/                   |              |
|    fps                  | 239          |
|    iterations           | 42           |
|    time_elapsed         | 179          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0069199214 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.31        |
|    explained_variance   | 0.749        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.01         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0389      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 190.38333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -271        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 50          |
|    time_elapsed         | 213         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008613871 |
|    clip_fraction        | 0.15        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.97       |
|    explained_variance   | 0.685       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0399     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 198.08764367816093
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -227         |
| time/                   |              |
|    fps                  | 239          |
|    iterations           | 58           |
|    time_elapsed         | 247          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0071947942 |
|    clip_fraction        | 0.148        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.68        |
|    explained_variance   | 0.526        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.16         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0435      |
|    value_

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 204.5568181818182
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -182         |
| time/                   |              |
|    fps                  | 240          |
|    iterations           | 66           |
|    time_elapsed         | 281          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0076054307 |
|    clip_fraction        | 0.149        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.49        |
|    explained_variance   | 0.491        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.11         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0422      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 210.13738738738738
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -138        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 74          |
|    time_elapsed         | 315         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006428404 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.28       |
|    explained_variance   | 0.327       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.911       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0399     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 214.9908536585366
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -96.9       |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 82          |
|    time_elapsed         | 350         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006243171 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.21       |
|    explained_variance   | 0.344       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.63        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0348     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 219.06203703703704
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -70.1       |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 90          |
|    time_elapsed         | 385         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.007670781 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.22       |
|    explained_variance   | 0.375       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.31        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0355     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 222.70238095238096
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -51.5        |
| time/                   |              |
|    fps                  | 239          |
|    iterations           | 98           |
|    time_elapsed         | 419          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0061965166 |
|    clip_fraction        | 0.128        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.18        |
|    explained_variance   | 0.482        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.6          |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0384      |
|    value_

In [20]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -376.0
Standard deviation of reward: 0.0
Average successful assignments: 13.083333333333334
All assignments history: [17, 20, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -363     |
| time/              |          |
|    fps             | 259      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -326.0
Standard deviation of reward: 0.0
Average successful assignments: 23.041666666666668
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -368       |
| time/                   |            |
|    fps                  | 251        |
|    iterations           | 2          |
|    time_elapsed         | 8          |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00804497 |
|    clip_fraction        | 0.0796     |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.84      |
|    explained_variance   | 0.0128     |
|    learning_rate        | 0.00018    |
|    loss                 | 2.26       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0411    |
|    value_loss           | 15.7       |
-------

-------- Rollout Summary --------
Total mean reward: -132.0
Standard deviation of reward: 0.0
Average successful assignments: 62.43333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 242         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009175162 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.00785     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.562       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0403     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 103.8425925925926
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -358         |
| time/                   |              |
|    fps                  | 240          |
|    iterations           | 18           |
|    time_elapsed         | 76           |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0114674885 |
|    clip_fraction        | 0.247        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.71        |
|    explained_variance   | 0.0821       |
|    learning_rate        | 0.00018      |
|    loss                 | 0.189        |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0504      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 90.0
Standard deviation of reward: 0.0
Average successful assignments: 132.04807692307693
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -347        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 26          |
|    time_elapsed         | 111         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008277377 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.51       |
|    explained_variance   | 0.191       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.559       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0419     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 116.0
Standard deviation of reward: 0.0
Average successful assignments: 153.0171568627451
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -329        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 34          |
|    time_elapsed         | 146         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008578155 |
|    clip_fraction        | 0.154       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.1        |
|    explained_variance   | 0.32        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0451     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 168.63492063492063
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -303        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 42          |
|    time_elapsed         | 180         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009347815 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.46       |
|    explained_variance   | 0.304       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0514     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 181.67333333333335
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -273         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 50           |
|    time_elapsed         | 215          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0076087946 |
|    clip_fraction        | 0.155        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.86        |
|    explained_variance   | 0.343        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.57         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0454      |
|    value_

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 192.54022988505747
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -224         |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 58           |
|    time_elapsed         | 250          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0067704124 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.38        |
|    explained_variance   | 0.421        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.55         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0348      |
|    value_

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 201.18055555555554
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -172         |
| time/                   |              |
|    fps                  | 235          |
|    iterations           | 66           |
|    time_elapsed         | 286          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0068969186 |
|    clip_fraction        | 0.129        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.08        |
|    explained_variance   | 0.377        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.76         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0373      |
|    value_

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 208.2668918918919
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -120        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 74          |
|    time_elapsed         | 322         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006283354 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.98       |
|    explained_variance   | 0.442       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.48        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0344     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 213.92174796747966
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -74.2       |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 82          |
|    time_elapsed         | 357         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.005772685 |
|    clip_fraction        | 0.0888      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3          |
|    explained_variance   | 0.466       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.39        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0314     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 172.0
Standard deviation of reward: 0.0
Average successful assignments: 218.88611111111112
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -39.7       |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 90          |
|    time_elapsed         | 393         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009251997 |
|    clip_fraction        | 0.154       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.07       |
|    explained_variance   | 0.434       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0335     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 180.0
Standard deviation of reward: 0.0
Average successful assignments: 223.40391156462584
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -19.4        |
| time/                   |              |
|    fps                  | 234          |
|    iterations           | 98           |
|    time_elapsed         | 428          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0073408354 |
|    clip_fraction        | 0.133        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.03        |
|    explained_variance   | 0.538        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.32         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0344      |
|    value_

In [21]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -388.0
Standard deviation of reward: 0.0
Average successful assignments: 7.583333333333333
All assignments history: [14, 17, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -369     |
| time/              |          |
|    fps             | 257      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -280.0
Standard deviation of reward: 0.0
Average successful assignments: 30.25
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -367         |
| time/                   |              |
|    fps                  | 242          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0075401887 |
|    clip_fraction        | 0.0791       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | 0.00523      |
|    learning_rate        | 0.00018      |
|    loss                 | 1.8          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0404      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 108.20833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 10          |
|    time_elapsed         | 43          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.008933647 |
|    clip_fraction        | 0.159       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.0189      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.036      |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 134.49074074074073
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -354        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 18          |
|    time_elapsed         | 78          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009539155 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.65       |
|    explained_variance   | 0.265       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.136       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0453     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 151.46794871794873
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -341        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 26          |
|    time_elapsed         | 113         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008153148 |
|    clip_fraction        | 0.162       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.35       |
|    explained_variance   | 0.38        |
|    learning_rate        | 0.00018     |
|    loss                 | 2.03        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0438     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 164.6691176470588
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -318       |
| time/                   |            |
|    fps                  | 233        |
|    iterations           | 34         |
|    time_elapsed         | 148        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.00962557 |
|    clip_fraction        | 0.203      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.81      |
|    explained_variance   | 0.239      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.898      |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0504    |
|    value_loss           | 2.79       |
----------

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 176.07738095238096
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -293        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 42          |
|    time_elapsed         | 183         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008267407 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.27       |
|    explained_variance   | 0.165       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0432     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 187.04666666666665
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -265         |
| time/                   |              |
|    fps                  | 233          |
|    iterations           | 50           |
|    time_elapsed         | 219          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0071818316 |
|    clip_fraction        | 0.126        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.78        |
|    explained_variance   | 0.236        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.2          |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0408      |
|    value_

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 196.69683908045977
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -219        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 58          |
|    time_elapsed         | 253         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.005476252 |
|    clip_fraction        | 0.0873      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.51       |
|    explained_variance   | 0.278       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.46        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0325     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 204.69823232323233
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -170         |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 66           |
|    time_elapsed         | 285          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0071403887 |
|    clip_fraction        | 0.134        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.2         |
|    explained_variance   | 0.321        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.2          |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0391      |
|    value_

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 211.28378378378378
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -122         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 74           |
|    time_elapsed         | 317          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0076523917 |
|    clip_fraction        | 0.141        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.01        |
|    explained_variance   | 0.376        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.36         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0362      |
|    value_

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 216.6392276422764
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -79.4        |
| time/                   |              |
|    fps                  | 242          |
|    iterations           | 82           |
|    time_elapsed         | 345          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0070419405 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.9         |
|    explained_variance   | 0.409        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.49         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.037       |
|    value_l

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 221.05
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -46.5        |
| time/                   |              |
|    fps                  | 245          |
|    iterations           | 90           |
|    time_elapsed         | 374          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0067416234 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.84        |
|    explained_variance   | 0.413        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.78         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0339      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 224.86479591836735
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -23.1      |
| time/                   |            |
|    fps                  | 250        |
|    iterations           | 98         |
|    time_elapsed         | 400        |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.00801545 |
|    clip_fraction        | 0.161      |
|    clip_range           | 0.15       |
|    entropy_loss         | -2.89      |
|    explained_variance   | 0.344      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.37       |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0406    |
|    value_loss           | 3.23       |
--------