In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def train_ridge_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    ridge_model = Ridge()
    ridge_model.fit(X_train_scaled, y_train)
    return ridge_model, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.1.csv')
ridge_model, scaler = train_ridge_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.1.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = ridge_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']  # This would be prior to overwriting with predictions if you run this block again

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


MAE: 0.8967544998516832
RMSE: 1.1358026956536529
R-squared: 0.991274663358528
RAE: 0.09365725311592739


In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -386.0
Standard deviation of reward: 0.0
Average successful assignments: 7.833333333333333
All assignments history: [13, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -376     |
| time/              |          |
|    fps             | 70       |
|    iterations      | 1        |
|    time_elapsed    | 14       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -288.0
Standard deviation of reward: 0.0
Average successful assignments: 28.291666666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -376         |
| time/                   |              |
|    fps                  | 66           |
|    iterations           | 2            |
|    time_elapsed         | 30           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0071727214 |
|    clip_fraction        | 0.0669       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | -0.287       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.3          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.04        |
|    value

-------- Rollout Summary --------
Total mean reward: -116.0
Standard deviation of reward: 0.0
Average successful assignments: 82.175
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 10          |
|    time_elapsed         | 175         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009944873 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.0141      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.668       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0423     |
|    value_loss           | 3.64        |

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 125.04166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 18          |
|    time_elapsed         | 325         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010806534 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.208       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.243       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0448     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 146.31410256410257
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -348        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 26          |
|    time_elapsed         | 481         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009655811 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.48       |
|    explained_variance   | 0.299       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.37        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0489     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 159.3406862745098
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -330        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 34          |
|    time_elapsed         | 628         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009014888 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.05       |
|    explained_variance   | 0.469       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.644       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0467     |
|    value_loss           | 2.86

-------- Rollout Summary --------
Total mean reward: 88.0
Standard deviation of reward: 0.0
Average successful assignments: 169.3531746031746
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -309         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 42           |
|    time_elapsed         | 788          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0079235695 |
|    clip_fraction        | 0.167        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.52        |
|    explained_variance   | 0.518        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.31         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0492      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 100.0
Standard deviation of reward: 0.0
Average successful assignments: 178.63333333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -284         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 50           |
|    time_elapsed         | 942          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0086539835 |
|    clip_fraction        | 0.149        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.03        |
|    explained_variance   | 0.423        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.927        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0426      |
|    value_

-------- Rollout Summary --------
Total mean reward: 114.0
Standard deviation of reward: 0.0
Average successful assignments: 186.55172413793105
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -240        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 58          |
|    time_elapsed         | 1108        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008016695 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.78       |
|    explained_variance   | 0.12        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.989       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0401     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 193.47348484848484
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -196        |
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 66          |
|    time_elapsed         | 1277        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006353104 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.47       |
|    explained_variance   | 0.346       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.5         |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0373     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 199.28153153153153
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -153         |
| time/                   |              |
|    fps                  | 52           |
|    iterations           | 74           |
|    time_elapsed         | 1447         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0065254383 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.46        |
|    explained_variance   | 0.411        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.55         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0421      |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 204.28760162601625
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -115        |
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 82          |
|    time_elapsed         | 1618        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006610384 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.3        |
|    explained_variance   | 0.555       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0389     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 208.74537037037038
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -82.1        |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 90           |
|    time_elapsed         | 1789         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0059884763 |
|    clip_fraction        | 0.105        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.19        |
|    explained_variance   | 0.585        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.05         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0349      |
|    value_

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 212.70238095238096
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -57.6        |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 98           |
|    time_elapsed         | 1950         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0064087743 |
|    clip_fraction        | 0.105        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.06        |
|    explained_variance   | 0.599        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.27         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0335      |
|    value_

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 8.833333333333334
All assignments history: [9, 17, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -374     |
| time/              |          |
|    fps             | 60       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -372.0
Standard deviation of reward: 0.0
Average successful assignments: 11.833333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 2           |
|    time_elapsed         | 35          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007867735 |
|    clip_fraction        | 0.082       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.0525     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.07        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0407     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 98.78333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 10          |
|    time_elapsed         | 186         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010394647 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.00621     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.56        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0419     |
|    value_loss           | 3.7 

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 133.99537037037038
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -359       |
| time/                   |            |
|    fps                  | 54         |
|    iterations           | 18         |
|    time_elapsed         | 339        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.00910845 |
|    clip_fraction        | 0.181      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.69      |
|    explained_variance   | 0.163      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.367      |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0451    |
|    value_loss           | 2.9        |
---------

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 148.38141025641025
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -348        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 26          |
|    time_elapsed         | 485         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007987559 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.47       |
|    explained_variance   | 0.325       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.366       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.042      |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 157.99754901960785
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -328        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 34          |
|    time_elapsed         | 635         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.006762107 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.94       |
|    explained_variance   | 0.328       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0434     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 166.62698412698413
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -302         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 42           |
|    time_elapsed         | 783          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0064862063 |
|    clip_fraction        | 0.0999       |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.25        |
|    explained_variance   | 0.489        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0381      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 174.305
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -276        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 50          |
|    time_elapsed         | 931         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007819944 |
|    clip_fraction        | 0.123       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.86       |
|    explained_variance   | 0.537       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.33        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0391     |
|    value_loss           | 2.96        |


-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 181.13074712643677
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -233        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 58          |
|    time_elapsed         | 1071        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007696846 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.64       |
|    explained_variance   | 0.558       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0398     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 108.0
Standard deviation of reward: 0.0
Average successful assignments: 187.65277777777777
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -190         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 66           |
|    time_elapsed         | 1209         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0070435237 |
|    clip_fraction        | 0.137        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.57        |
|    explained_variance   | 0.519        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.08         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0398      |
|    value_

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 193.48085585585585
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -149         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 74           |
|    time_elapsed         | 1344         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0064381836 |
|    clip_fraction        | 0.122        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.5         |
|    explained_variance   | 0.481        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.03         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0389      |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 198.7428861788618
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -114         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 82           |
|    time_elapsed         | 1481         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0071850903 |
|    clip_fraction        | 0.121        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.4         |
|    explained_variance   | 0.421        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.879        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0369      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 203.5675925925926
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -87.8        |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 90           |
|    time_elapsed         | 1618         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0077079246 |
|    clip_fraction        | 0.153        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.21        |
|    explained_variance   | 0.397        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.14         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0401      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 207.71428571428572
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -67.8        |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 98           |
|    time_elapsed         | 1761         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0066422047 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.03        |
|    explained_variance   | 0.403        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.17         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0368      |
|    value_

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -390.0
Standard deviation of reward: 0.0
Average successful assignments: 6.5
All assignments history: [13, 15, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 69       |
|    iterations      | 1        |
|    time_elapsed    | 14       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -324.0
Standard deviation of reward: 0.0
Average successful assignments: 20.416666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 65          |
|    iterations           | 2           |
|    time_elapsed         | 31          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007000914 |
|    clip_fraction        | 0.0661      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.249      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.11        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0375     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -28.0
Standard deviation of reward: 0.0
Average successful assignments: 96.55
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 10          |
|    time_elapsed         | 166         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010392476 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | -0.000429   |
|    learning_rate        | 0.00018     |
|    loss                 | 0.926       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0421     |
|    value_loss           | 3.79        |
-

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 132.6064814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 18          |
|    time_elapsed         | 300         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009560424 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.196       |
|    learning_rate        | 0.00018     |
|    loss                 | -0.0457     |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0421     |
|    value_loss           | 2.74

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 148.2051282051282
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -347        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 26          |
|    time_elapsed         | 437         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.006833949 |
|    clip_fraction        | 0.111       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.353       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.613       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.036      |
|    value_loss           | 3.26

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 157.6299019607843
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -327        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 34          |
|    time_elapsed         | 576         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008109254 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.88       |
|    explained_variance   | 0.471       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.786       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0409     |
|    value_loss           | 2.84

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 166.81150793650792
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -301        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 42          |
|    time_elapsed         | 714         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007829843 |
|    clip_fraction        | 0.148       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.25       |
|    explained_variance   | 0.663       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0431     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 174.77166666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -274         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 50           |
|    time_elapsed         | 854          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0067533194 |
|    clip_fraction        | 0.125        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.83        |
|    explained_variance   | 0.676        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.21         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.039       |
|    value_l

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 182.40086206896552
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -230        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 58          |
|    time_elapsed         | 991         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007815592 |
|    clip_fraction        | 0.162       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.67       |
|    explained_variance   | 0.534       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.808       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0445     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 188.97222222222223
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -187         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 66           |
|    time_elapsed         | 1129         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0064097214 |
|    clip_fraction        | 0.118        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.53        |
|    explained_variance   | 0.432        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.38         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0356      |
|    value_

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 195.39977477477478
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -144        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 74          |
|    time_elapsed         | 1269        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006554257 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.39       |
|    explained_variance   | 0.343       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0365     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 201.08130081300814
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -108        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 82          |
|    time_elapsed         | 1418        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006383815 |
|    clip_fraction        | 0.1         |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.27       |
|    explained_variance   | 0.434       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.65        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0308     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 205.87314814814815
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -80.8        |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 90           |
|    time_elapsed         | 1565         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0053526387 |
|    clip_fraction        | 0.0959       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.23        |
|    explained_variance   | 0.496        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.38         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0326      |
|    value_

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 210.1377551020408
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -61.8       |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 98          |
|    time_elapsed         | 1717        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005628464 |
|    clip_fraction        | 0.106       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.11       |
|    explained_variance   | 0.582       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.52        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0315     |
|    value_loss           | 3.5

In [15]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 9.0
All assignments history: [16, 12, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 57       |
|    iterations      | 1        |
|    time_elapsed    | 17       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -318.0
Standard deviation of reward: 0.0
Average successful assignments: 22.541666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 2           |
|    time_elapsed         | 35          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007904239 |
|    clip_fraction        | 0.0823      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.0857     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.43        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0389     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -46.0
Standard deviation of reward: 0.0
Average successful assignments: 88.04166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 10          |
|    time_elapsed         | 181         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010320766 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.029       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.12        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0406     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 127.77777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 330         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009921525 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.236       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.429       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0423     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 144.8846153846154
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -349        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 26          |
|    time_elapsed         | 479         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009118572 |
|    clip_fraction        | 0.164       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.356       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.09        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0431     |
|    value_loss           | 2.55

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 155.78676470588235
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -329        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 34          |
|    time_elapsed         | 622         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008189232 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.92       |
|    explained_variance   | 0.444       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.6         |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.047      |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 164.69642857142858
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -303        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 42          |
|    time_elapsed         | 769         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008213516 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.33       |
|    explained_variance   | 0.608       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.795       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0403     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 174.685
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -278         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 50           |
|    time_elapsed         | 917          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0077950153 |
|    clip_fraction        | 0.125        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.96        |
|    explained_variance   | 0.392        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.937        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0384      |
|    value_loss       

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 184.05603448275863
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -234         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 58           |
|    time_elapsed         | 1069         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0070822486 |
|    clip_fraction        | 0.118        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.58        |
|    explained_variance   | 0.33         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.22         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0381      |
|    value_

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 192.01388888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -189        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 66          |
|    time_elapsed         | 1218        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006037183 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.38       |
|    explained_variance   | 0.44        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.39        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.034      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 198.5990990990991
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -143         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 74           |
|    time_elapsed         | 1371         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0056055877 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.23        |
|    explained_variance   | 0.467        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.48         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0345      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 203.8739837398374
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -105        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 82          |
|    time_elapsed         | 1511        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006060361 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.12       |
|    explained_variance   | 0.541       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.46        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0343     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 208.48611111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -75         |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 90          |
|    time_elapsed         | 1652        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005748542 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.17       |
|    explained_variance   | 0.612       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.75        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.031      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 212.4778911564626
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -54.9        |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 98           |
|    time_elapsed         | 1801         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0061273603 |
|    clip_fraction        | 0.101        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.03        |
|    explained_variance   | 0.671        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.75         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0317      |
|    value_l

In [16]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -392.0
Standard deviation of reward: 0.0
Average successful assignments: 6.0
All assignments history: [10, 22, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -368     |
| time/              |          |
|    fps             | 56       |
|    iterations      | 1        |
|    time_elapsed    | 18       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -200.0
Standard deviation of reward: 0.0
Average successful assignments: 45.541666666666664
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -374         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 2            |
|    time_elapsed         | 36           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0065878863 |
|    clip_fraction        | 0.0593       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | -0.246       |
|    learning_rate        | 0.00018      |
|    loss                 | 3.45         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0389      |
|    value

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 132.00833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 10          |
|    time_elapsed         | 184         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009183964 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.0123      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.668       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.039      |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 149.8101851851852
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 332         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010731483 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.65       |
|    explained_variance   | 0.19        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.703       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.045      |
|    value_loss           | 2.72

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 160.64102564102564
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -348        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 26          |
|    time_elapsed         | 479         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.006801374 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.39       |
|    explained_variance   | 0.217       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.778       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.039      |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 168.2156862745098
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -326         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 34           |
|    time_elapsed         | 623          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0072476217 |
|    clip_fraction        | 0.121        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.81        |
|    explained_variance   | 0.125        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0416      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 174.51785714285714
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -299        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 42          |
|    time_elapsed         | 760         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.005522413 |
|    clip_fraction        | 0.0847      |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.13       |
|    explained_variance   | 0.147       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0352     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 180.095
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -272         |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 50           |
|    time_elapsed         | 890          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0062248055 |
|    clip_fraction        | 0.106        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.82        |
|    explained_variance   | 0.211        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.883        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0369      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 185.90086206896552
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -231         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 58           |
|    time_elapsed         | 1018         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0064009773 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.58        |
|    explained_variance   | 0.303        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.14         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0402      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 190.989898989899
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 66          |
|    time_elapsed         | 1148        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.005550173 |
|    clip_fraction        | 0.0841      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.51       |
|    explained_variance   | 0.366       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.67        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0333     |
|    value_loss           | 3.64 

-------- Rollout Summary --------
Total mean reward: 106.0
Standard deviation of reward: 0.0
Average successful assignments: 195.6509009009009
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -147         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 74           |
|    time_elapsed         | 1275         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0074138884 |
|    clip_fraction        | 0.119        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.42        |
|    explained_variance   | 0.497        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0388      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 200.22459349593495
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -113        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 82          |
|    time_elapsed         | 1390        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.007175516 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.28       |
|    explained_variance   | 0.493       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.19        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0352     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 204.50185185185185
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -88.3        |
| time/                   |              |
|    fps                  | 61           |
|    iterations           | 90           |
|    time_elapsed         | 1502         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0053277137 |
|    clip_fraction        | 0.0983       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.32        |
|    explained_variance   | 0.569        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.39         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0347      |
|    value_

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 208.52891156462584
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -74.3       |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 98          |
|    time_elapsed         | 1612        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009181127 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.39       |
|    explained_variance   | 0.698       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0387     |
|    value_loss           | 3.

In [17]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 10.75
All assignments history: [19, 20, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -361     |
| time/              |          |
|    fps             | 79       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -308.0
Standard deviation of reward: 0.0
Average successful assignments: 25.75
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -366         |
| time/                   |              |
|    fps                  | 76           |
|    iterations           | 2            |
|    time_elapsed         | 26           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0076146424 |
|    clip_fraction        | 0.0733       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | -0.163       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.85         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.04        |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: -102.0
Standard deviation of reward: 0.0
Average successful assignments: 84.2
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 10          |
|    time_elapsed         | 138         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010421181 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.00993     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0435     |
|    value_loss           | 4.11        |
-

-------- Rollout Summary --------
Total mean reward: -42.0
Standard deviation of reward: 0.0
Average successful assignments: 107.24074074074075
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 18          |
|    time_elapsed         | 247         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011276968 |
|    clip_fraction        | 0.235       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.67       |
|    explained_variance   | 0.167       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.225       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0463     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 128.35897435897436
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -349        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 26          |
|    time_elapsed         | 361         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008237323 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.44       |
|    explained_variance   | 0.365       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.09        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0396     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 143.73039215686273
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -331        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 34          |
|    time_elapsed         | 471         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009362513 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.94       |
|    explained_variance   | 0.418       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.26        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0469     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 155.24007936507937
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -307        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 42          |
|    time_elapsed         | 603         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007315307 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.26       |
|    explained_variance   | 0.418       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.09        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0411     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 164.32333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -281        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 50          |
|    time_elapsed         | 709         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006871123 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.78       |
|    explained_variance   | 0.398       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0396     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 172.2183908045977
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -240        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 58          |
|    time_elapsed         | 826         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006553691 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.63       |
|    explained_variance   | 0.307       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0354     |
|    value_loss           | 3.17

-------- Rollout Summary --------
Total mean reward: 100.0
Standard deviation of reward: 0.0
Average successful assignments: 179.18813131313132
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -195         |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 66           |
|    time_elapsed         | 937          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0064671496 |
|    clip_fraction        | 0.108        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.46        |
|    explained_variance   | 0.366        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.25         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0333      |
|    value_

-------- Rollout Summary --------
Total mean reward: 104.0
Standard deviation of reward: 0.0
Average successful assignments: 185.45495495495496
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -154        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 74          |
|    time_elapsed         | 1039        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.005468292 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.36       |
|    explained_variance   | 0.392       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.27        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0347     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 190.72459349593495
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -117        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 82          |
|    time_elapsed         | 1141        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006885047 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.24       |
|    explained_variance   | 0.454       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0378     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 195.46481481481482
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -89.7       |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 90          |
|    time_elapsed         | 1243        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.006233669 |
|    clip_fraction        | 0.0968      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.2        |
|    explained_variance   | 0.511       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.41        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0318     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 199.56037414965985
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -72         |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 98          |
|    time_elapsed         | 1350        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005782791 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.12       |
|    explained_variance   | 0.567       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.58        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0343     |
|    value_loss           | 3.

In [18]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 10.0
All assignments history: [15, 15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -370     |
| time/              |          |
|    fps             | 80       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -346.0
Standard deviation of reward: 0.0
Average successful assignments: 17.25
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007617446 |
|    clip_fraction        | 0.0763      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.376      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.22        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0377     |
|    value_loss           | 16.8        |


-------- Rollout Summary --------
Total mean reward: -34.0
Standard deviation of reward: 0.0
Average successful assignments: 104.49166666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 10          |
|    time_elapsed         | 127         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010157961 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.0108      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.3         |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0422     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -24.0
Standard deviation of reward: 0.0
Average successful assignments: 128.86574074074073
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 18          |
|    time_elapsed         | 229         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010085449 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.108       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.558       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0431     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 140.2403846153846
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -347        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 26          |
|    time_elapsed         | 332         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008208133 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.214       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.46        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0401     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 151.23039215686273
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -328        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 34          |
|    time_elapsed         | 432         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007678018 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.87       |
|    explained_variance   | 0.29        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.09        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0405     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 160.71428571428572
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -303        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 42          |
|    time_elapsed         | 521         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007342649 |
|    clip_fraction        | 0.127       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.29       |
|    explained_variance   | 0.31        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0452     |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 168.48
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -275        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 50          |
|    time_elapsed         | 598         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006779015 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.88       |
|    explained_variance   | 0.279       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.796       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0366     |
|    value_loss           | 3.07        |
-

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 176.46264367816093
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -233        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 58          |
|    time_elapsed         | 650         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006219098 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.64       |
|    explained_variance   | 0.24        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0358     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 184.22727272727272
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -190         |
| time/                   |              |
|    fps                  | 96           |
|    iterations           | 66           |
|    time_elapsed         | 697          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0059617497 |
|    clip_fraction        | 0.0915       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.59        |
|    explained_variance   | 0.345        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0332      |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 191.3536036036036
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -146        |
| time/                   |             |
|    fps                  | 103         |
|    iterations           | 74          |
|    time_elapsed         | 732         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.005427847 |
|    clip_fraction        | 0.092       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.36       |
|    explained_variance   | 0.379       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0343     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 197.33434959349594
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -106         |
| time/                   |              |
|    fps                  | 109          |
|    iterations           | 82           |
|    time_elapsed         | 766          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0056613125 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.19        |
|    explained_variance   | 0.469        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.814        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0342      |
|    value_

-------- Rollout Summary --------
Total mean reward: 148.0
Standard deviation of reward: 0.0
Average successful assignments: 202.64074074074074
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -76.1        |
| time/                   |              |
|    fps                  | 114          |
|    iterations           | 90           |
|    time_elapsed         | 801          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0054749874 |
|    clip_fraction        | 0.0981       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.13        |
|    explained_variance   | 0.493        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.858        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0312      |
|    value_

-------- Rollout Summary --------
Total mean reward: 148.0
Standard deviation of reward: 0.0
Average successful assignments: 207.32908163265307
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -56.9        |
| time/                   |              |
|    fps                  | 120          |
|    iterations           | 98           |
|    time_elapsed         | 836          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0063837413 |
|    clip_fraction        | 0.102        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.15        |
|    explained_variance   | 0.557        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.42         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0343      |
|    value_

In [19]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -378.0
Standard deviation of reward: 0.0
Average successful assignments: 10.916666666666666
All assignments history: [14, 7, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -379     |
| time/              |          |
|    fps             | 264      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -374.0
Standard deviation of reward: 0.0
Average successful assignments: 12.208333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 248         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007118349 |
|    clip_fraction        | 0.0672      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | 0.0445      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.58        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0358     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -130.0
Standard deviation of reward: 0.0
Average successful assignments: 70.475
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 241         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010112205 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.76       |
|    explained_variance   | 0.0106      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.7         |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0422     |
|    value_loss           | 3.91        |

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 115.00462962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 18          |
|    time_elapsed         | 77          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.008849563 |
|    clip_fraction        | 0.15        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.66       |
|    explained_variance   | 0.26        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.153       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0369     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 135.82371794871796
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -348        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 26          |
|    time_elapsed         | 111         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007542768 |
|    clip_fraction        | 0.138       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.42       |
|    explained_variance   | 0.41        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.703       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0402     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 147.85539215686273
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -328        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 34          |
|    time_elapsed         | 145         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008810377 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.88       |
|    explained_variance   | 0.563       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.918       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0461     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 157.60714285714286
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -302        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 42          |
|    time_elapsed         | 180         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008171134 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.3        |
|    explained_variance   | 0.5         |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0422     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 165.68333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -278         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 50           |
|    time_elapsed         | 214          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0071962764 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.01        |
|    explained_variance   | 0.303        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.41         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0392      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 88.0
Standard deviation of reward: 0.0
Average successful assignments: 173.69827586206895
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -236         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 58           |
|    time_elapsed         | 248          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0074902065 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.75        |
|    explained_variance   | 0.393        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.68         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0349      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 116.0
Standard deviation of reward: 0.0
Average successful assignments: 181.07449494949495
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -195       |
| time/                   |            |
|    fps                  | 238        |
|    iterations           | 66         |
|    time_elapsed         | 282        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.00782267 |
|    clip_fraction        | 0.133      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.68      |
|    explained_variance   | 0.265      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.09       |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0399    |
|    value_loss           | 3.03       |
--------

-------- Rollout Summary --------
Total mean reward: 126.0
Standard deviation of reward: 0.0
Average successful assignments: 187.70045045045046
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -155        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 74          |
|    time_elapsed         | 317         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.007689454 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.6        |
|    explained_variance   | 0.3         |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0411     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 193.41869918699186
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -123         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 82           |
|    time_elapsed         | 351          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0067444546 |
|    clip_fraction        | 0.142        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.46        |
|    explained_variance   | 0.423        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.04         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0421      |
|    value_

-------- Rollout Summary --------
Total mean reward: 130.0
Standard deviation of reward: 0.0
Average successful assignments: 198.46203703703705
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -96.6        |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 90           |
|    time_elapsed         | 385          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0063813133 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.39        |
|    explained_variance   | 0.472        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.976        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0364      |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 202.88095238095238
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -78.4        |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 98           |
|    time_elapsed         | 420          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0071963863 |
|    clip_fraction        | 0.133        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.31        |
|    explained_variance   | 0.553        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.2          |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0374      |
|    value_

In [20]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -374.0
Standard deviation of reward: 0.0
Average successful assignments: 13.416666666666666
All assignments history: [13, 18, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -369     |
| time/              |          |
|    fps             | 271      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -376.0
Standard deviation of reward: 0.0
Average successful assignments: 12.458333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -376        |
| time/                   |             |
|    fps                  | 256         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007829893 |
|    clip_fraction        | 0.0857      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.249      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.35        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0387     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 96.075
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 242         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010605672 |
|    clip_fraction        | 0.219       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.76       |
|    explained_variance   | 0.00894     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.316       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0426     |
|    value_loss           | 3.88        |


-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 125.70370370370371
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 240         |
|    iterations           | 18          |
|    time_elapsed         | 76          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.008102678 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.14        |
|    learning_rate        | 0.00018     |
|    loss                 | 3.19        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0394     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 139.69551282051282
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -351         |
| time/                   |              |
|    fps                  | 239          |
|    iterations           | 26           |
|    time_elapsed         | 110          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0074930093 |
|    clip_fraction        | 0.125        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.49        |
|    explained_variance   | 0.317        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.09         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0399      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 151.9191176470588
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -334        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 34          |
|    time_elapsed         | 145         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007714104 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.95       |
|    explained_variance   | 0.31        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.12        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.043      |
|    value_loss           | 2.82

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 162.92261904761904
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -308        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 42          |
|    time_elapsed         | 180         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009041791 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.35       |
|    explained_variance   | 0.444       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.41        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0476     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 172.11666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -282        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 50          |
|    time_elapsed         | 214         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008363098 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.92       |
|    explained_variance   | 0.521       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.07        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0429     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 100.0
Standard deviation of reward: 0.0
Average successful assignments: 180.10632183908046
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -242         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 58           |
|    time_elapsed         | 249          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0068200044 |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.65        |
|    explained_variance   | 0.328        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.34         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0384      |
|    value_

-------- Rollout Summary --------
Total mean reward: 114.0
Standard deviation of reward: 0.0
Average successful assignments: 187.2020202020202
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -198        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 66          |
|    time_elapsed         | 284         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006369645 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.44       |
|    explained_variance   | 0.248       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0348     |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 193.18243243243242
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -157         |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 74           |
|    time_elapsed         | 320          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0070928643 |
|    clip_fraction        | 0.144        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.56        |
|    explained_variance   | 0.348        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.16         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.043       |
|    value_

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 198.2378048780488
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -122        |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 82          |
|    time_elapsed         | 356         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.005282973 |
|    clip_fraction        | 0.0979      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.38       |
|    explained_variance   | 0.454       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.34        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.033      |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 202.64166666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -94.5       |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 90          |
|    time_elapsed         | 391         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.006175696 |
|    clip_fraction        | 0.118       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.32       |
|    explained_variance   | 0.501       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.31        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0364     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 206.33843537414967
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -76.8       |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 98          |
|    time_elapsed         | 427         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005220937 |
|    clip_fraction        | 0.0881      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.34       |
|    explained_variance   | 0.468       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.46        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.033      |
|    value_loss           | 3.

In [21]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -392.0
Standard deviation of reward: 0.0
Average successful assignments: 5.75
All assignments history: [16, 13, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -371     |
| time/              |          |
|    fps             | 255      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -344.0
Standard deviation of reward: 0.0
Average successful assignments: 15.666666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 240         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007167103 |
|    clip_fraction        | 0.0703      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.206      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.49        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.041      |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 89.84166666666667
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -368         |
| time/                   |              |
|    fps                  | 234          |
|    iterations           | 10           |
|    time_elapsed         | 43           |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0096925255 |
|    clip_fraction        | 0.166        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.78        |
|    explained_variance   | 0.013        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.303        |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.0398      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 128.87962962962962
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 18          |
|    time_elapsed         | 78          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009952436 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.16        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.371       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0443     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 144.92948717948718
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -349       |
| time/                   |            |
|    fps                  | 234        |
|    iterations           | 26         |
|    time_elapsed         | 113        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00976792 |
|    clip_fraction        | 0.187      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.42      |
|    explained_variance   | 0.168      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.77       |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0479    |
|    value_loss           | 3.1        |
---------

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 155.63480392156862
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -329         |
| time/                   |              |
|    fps                  | 233          |
|    iterations           | 34           |
|    time_elapsed         | 149          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0075411284 |
|    clip_fraction        | 0.122        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.78        |
|    explained_variance   | 0.162        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.04         |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0408      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 165.23015873015873
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -302         |
| time/                   |              |
|    fps                  | 233          |
|    iterations           | 42           |
|    time_elapsed         | 184          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0062987288 |
|    clip_fraction        | 0.109        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.14        |
|    explained_variance   | 0.513        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.749        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0393      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 173.77166666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -274         |
| time/                   |              |
|    fps                  | 233          |
|    iterations           | 50           |
|    time_elapsed         | 219          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0072292164 |
|    clip_fraction        | 0.109        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.8         |
|    explained_variance   | 0.513        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0358      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 116.0
Standard deviation of reward: 0.0
Average successful assignments: 182.57902298850576
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -230         |
| time/                   |              |
|    fps                  | 232          |
|    iterations           | 58           |
|    time_elapsed         | 254          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0064606955 |
|    clip_fraction        | 0.153        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.47        |
|    explained_variance   | 0.403        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.49         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0437      |
|    value_

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 189.91919191919192
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -181         |
| time/                   |              |
|    fps                  | 233          |
|    iterations           | 66           |
|    time_elapsed         | 289          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0048839515 |
|    clip_fraction        | 0.0666       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.34        |
|    explained_variance   | 0.331        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0279      |
|    value_

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 196.30292792792793
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -136         |
| time/                   |              |
|    fps                  | 235          |
|    iterations           | 74           |
|    time_elapsed         | 321          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0064771227 |
|    clip_fraction        | 0.0984       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.34        |
|    explained_variance   | 0.398        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.56         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0342      |
|    value_

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 201.8658536585366
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -98.1        |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 82           |
|    time_elapsed         | 353          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0061860634 |
|    clip_fraction        | 0.0936       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.27        |
|    explained_variance   | 0.342        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.46         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0339      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 206.82685185185184
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -71.8        |
| time/                   |              |
|    fps                  | 240          |
|    iterations           | 90           |
|    time_elapsed         | 383          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0061467076 |
|    clip_fraction        | 0.105        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.29        |
|    explained_variance   | 0.414        |
|    learning_rate        | 0.00018      |
|    loss                 | 2.07         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0324      |
|    value_

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 210.93537414965985
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -54.8        |
| time/                   |              |
|    fps                  | 243          |
|    iterations           | 98           |
|    time_elapsed         | 412          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0061590923 |
|    clip_fraction        | 0.111        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.19        |
|    explained_variance   | 0.446        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.71         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0371      |
|    value_