In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def optimize_knn_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    knn_model = KNeighborsRegressor()
    param_grid = {
        'n_neighbors': [3, 5, 10, 20, 30, 40],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'p': [1, 2]  # 1: Manhattan distance, 2: Euclidean distance
    }
    grid_search = GridSearchCV(knn_model, param_grid, cv=5, verbose=1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    print("Best parameters:", grid_search.best_params_)
    best_knn = grid_search.best_estimator_
    return best_knn, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.1.csv')
knn_model, scaler = optimize_knn_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.1.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = knn_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
MAE: 1.9645398393113325
RMSE: 2.4344408047900314
R-squared: 0.9599156505668087
RAE: 0.20517700777318204


In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -372.0
Standard deviation of reward: 0.0
Average successful assignments: 13.5
All assignments history: [8, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -378     |
| time/              |          |
|    fps             | 201      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -322.0
Standard deviation of reward: 0.0
Average successful assignments: 24.166666666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -375         |
| time/                   |              |
|    fps                  | 180          |
|    iterations           | 2            |
|    time_elapsed         | 11           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0068475376 |
|    clip_fraction        | 0.0577       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.88        |
|    explained_variance   | -0.214       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.66         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0378      |
|    value

-------- Rollout Summary --------
Total mean reward: -126.0
Standard deviation of reward: 0.0
Average successful assignments: 56.108333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 132         |
|    iterations           | 10          |
|    time_elapsed         | 77          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009969589 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.83       |
|    explained_variance   | 0.00446     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0418     |
|    value_loss           | 3

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 97.55092592592592
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 18          |
|    time_elapsed         | 159         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009558548 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.0744      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.2         |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0409     |
|    value_loss           | 2.87

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 125.24358974358974
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -356       |
| time/                   |            |
|    fps                  | 87         |
|    iterations           | 26         |
|    time_elapsed         | 304        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00813533 |
|    clip_fraction        | 0.127      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.57      |
|    explained_variance   | 0.193      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.872      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0409    |
|    value_loss           | 3.11       |
---------

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 141.38480392156862
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -338        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 34          |
|    time_elapsed         | 455         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007905473 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.12       |
|    explained_variance   | 0.393       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.918       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0432     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 155.56944444444446
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -315        |
| time/                   |             |
|    fps                  | 69          |
|    iterations           | 42          |
|    time_elapsed         | 614         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008417136 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.49       |
|    explained_variance   | 0.448       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.26        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.049      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 168.71833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -286        |
| time/                   |             |
|    fps                  | 66          |
|    iterations           | 50          |
|    time_elapsed         | 765         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006172146 |
|    clip_fraction        | 0.0834      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.9        |
|    explained_variance   | 0.537       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.43        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.033      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 178.68965517241378
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -241        |
| time/                   |             |
|    fps                  | 64          |
|    iterations           | 58          |
|    time_elapsed         | 927         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007355328 |
|    clip_fraction        | 0.125       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.62       |
|    explained_variance   | 0.465       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0394     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 187.12373737373738
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -193        |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 66          |
|    time_elapsed         | 1082        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006215672 |
|    clip_fraction        | 0.12        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.31       |
|    explained_variance   | 0.506       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0345     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 194.4358108108108
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -143        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 74          |
|    time_elapsed         | 1247        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006491244 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.15       |
|    explained_variance   | 0.525       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0305     |
|    value_loss           | 3.5

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 200.27845528455285
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -99.8        |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 82           |
|    time_elapsed         | 1419         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0057224156 |
|    clip_fraction        | 0.0929       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.16        |
|    explained_variance   | 0.468        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.43         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0287      |
|    value_

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 205.2925925925926
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -65.3       |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 90          |
|    time_elapsed         | 1592        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.006391617 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.95       |
|    explained_variance   | 0.519       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0313     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: 148.0
Standard deviation of reward: 0.0
Average successful assignments: 209.6734693877551
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -40.4        |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 98           |
|    time_elapsed         | 1766         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0063703605 |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.01        |
|    explained_variance   | 0.454        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.39         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0371      |
|    value_l

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -376.0
Standard deviation of reward: 0.0
Average successful assignments: 12.083333333333334
All assignments history: [17, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -375     |
| time/              |          |
|    fps             | 52       |
|    iterations      | 1        |
|    time_elapsed    | 19       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -340.0
Standard deviation of reward: 0.0
Average successful assignments: 19.625
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 49          |
|    iterations           | 2           |
|    time_elapsed         | 41          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008592821 |
|    clip_fraction        | 0.0975      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.88       |
|    explained_variance   | -0.157      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.99        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0434     |
|    value_loss           | 17.6        |

-------- Rollout Summary --------
Total mean reward: -176.0
Standard deviation of reward: 0.0
Average successful assignments: 65.275
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 49          |
|    iterations           | 10          |
|    time_elapsed         | 207         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009334944 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.8        |
|    explained_variance   | 0.0135      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.19        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0354     |
|    value_loss           | 4.5         |

-------- Rollout Summary --------
Total mean reward: -74.0
Standard deviation of reward: 0.0
Average successful assignments: 92.64814814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 50          |
|    iterations           | 18          |
|    time_elapsed         | 365         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011535957 |
|    clip_fraction        | 0.233       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.11        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.128       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0446     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 117.18589743589743
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -347       |
| time/                   |            |
|    fps                  | 51         |
|    iterations           | 26         |
|    time_elapsed         | 518        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00809734 |
|    clip_fraction        | 0.134      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.48      |
|    explained_variance   | 0.206      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.557      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0413    |
|    value_loss           | 2.98       |
---------

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 135.21323529411765
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -327        |
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 34          |
|    time_elapsed         | 673         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008605367 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.88       |
|    explained_variance   | 0.213       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.694       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0456     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 148.02380952380952
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -301        |
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 42          |
|    time_elapsed         | 821         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008394945 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.18       |
|    explained_variance   | 0.192       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.827       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0484     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 157.37333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -274        |
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 50          |
|    time_elapsed         | 971         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006218126 |
|    clip_fraction        | 0.0954      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.86       |
|    explained_variance   | 0.369       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.93        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0364     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 165.26724137931035
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -232         |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 58           |
|    time_elapsed         | 1119         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0055095926 |
|    clip_fraction        | 0.0824       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.59        |
|    explained_variance   | 0.387        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.24         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0314      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 172.17171717171718
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -190        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 66          |
|    time_elapsed         | 1269        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006858385 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.54       |
|    explained_variance   | 0.408       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0359     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 86.0
Standard deviation of reward: 0.0
Average successful assignments: 178.14527027027026
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -150        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 74          |
|    time_elapsed         | 1412        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.005917608 |
|    clip_fraction        | 0.104       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.41       |
|    explained_variance   | 0.357       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.716       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0342     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 183.265243902439
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -118         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 82           |
|    time_elapsed         | 1553         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0064318413 |
|    clip_fraction        | 0.109        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.5         |
|    explained_variance   | 0.386        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.43         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0361      |
|    value_los

-------- Rollout Summary --------
Total mean reward: 98.0
Standard deviation of reward: 0.0
Average successful assignments: 187.8388888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -97.6       |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 90          |
|    time_elapsed         | 1691        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.007317531 |
|    clip_fraction        | 0.118       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.44       |
|    explained_variance   | 0.441       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.25        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0363     |
|    value_loss           | 3.35

-------- Rollout Summary --------
Total mean reward: 98.0
Standard deviation of reward: 0.0
Average successful assignments: 191.62244897959184
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -86.8     |
| time/                   |           |
|    fps                  | 54        |
|    iterations           | 98        |
|    time_elapsed         | 1829      |
|    total_timesteps      | 100352    |
| train/                  |           |
|    approx_kl            | 0.0064821 |
|    clip_fraction        | 0.126     |
|    clip_range           | 0.15      |
|    entropy_loss         | -3.43     |
|    explained_variance   | 0.435     |
|    learning_rate        | 0.00018   |
|    loss                 | 1.44      |
|    n_updates            | 970       |
|    policy_gradient_loss | -0.04     |
|    value_loss           | 3.24      |
-----------------------------

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -380.0
Standard deviation of reward: 0.0
Average successful assignments: 11.0
All assignments history: [17, 15, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -368     |
| time/              |          |
|    fps             | 63       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -258.0
Standard deviation of reward: 0.0
Average successful assignments: 36.208333333333336
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 2           |
|    time_elapsed         | 32          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008145361 |
|    clip_fraction        | 0.0901      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.88       |
|    explained_variance   | -0.0981     |
|    learning_rate        | 0.00018     |
|    loss                 | 4.04        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0437     |
|    value_loss           | 2

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 119.1
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -365      |
| time/                   |           |
|    fps                  | 58        |
|    iterations           | 10        |
|    time_elapsed         | 174       |
|    total_timesteps      | 10240     |
| train/                  |           |
|    approx_kl            | 0.0100648 |
|    clip_fraction        | 0.184     |
|    clip_range           | 0.15      |
|    entropy_loss         | -5.8      |
|    explained_variance   | 0.0048    |
|    learning_rate        | 0.00018   |
|    loss                 | 0.51      |
|    n_updates            | 90        |
|    policy_gradient_loss | -0.0423   |
|    value_loss           | 4.23      |
---------------------------------------
--

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 146.96296296296296
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -356        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 18          |
|    time_elapsed         | 314         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011170417 |
|    clip_fraction        | 0.252       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.71       |
|    explained_variance   | 0.125       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0483     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 160.96794871794873
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -343        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 26          |
|    time_elapsed         | 451         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008964615 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.43       |
|    explained_variance   | 0.277       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.17        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0466     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 170.65196078431373
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -322        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 34          |
|    time_elapsed         | 587         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008176582 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.85       |
|    explained_variance   | 0.394       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0492     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 177.28373015873015
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -296        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 42          |
|    time_elapsed         | 726         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008119231 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.26       |
|    explained_variance   | 0.599       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.91        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0436     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 183.92
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -267         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 50           |
|    time_elapsed         | 867          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0073204166 |
|    clip_fraction        | 0.126        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.82        |
|    explained_variance   | 0.592        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.796        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0395      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 191.38362068965517
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -224         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 58           |
|    time_elapsed         | 1007         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0058598863 |
|    clip_fraction        | 0.108        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.67        |
|    explained_variance   | 0.458        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.07         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0383      |
|    value_

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 198.41287878787878
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 66          |
|    time_elapsed         | 1148        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006336745 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.49       |
|    explained_variance   | 0.368       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.98        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0367     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 204.1283783783784
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -135         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 74           |
|    time_elapsed         | 1285         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0053719934 |
|    clip_fraction        | 0.0745       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.32        |
|    explained_variance   | 0.448        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.34         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0306      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 209.08739837398375
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -98         |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 82          |
|    time_elapsed         | 1424        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.005644599 |
|    clip_fraction        | 0.0938      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.23       |
|    explained_variance   | 0.529       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0324     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 213.62037037037038
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -69.8        |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 90           |
|    time_elapsed         | 1569         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0061754202 |
|    clip_fraction        | 0.0974       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.2         |
|    explained_variance   | 0.574        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.63         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0275      |
|    value_

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 217.44727891156464
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -52.6       |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 98          |
|    time_elapsed         | 1717        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005971286 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.19       |
|    explained_variance   | 0.635       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.55        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0363     |
|    value_loss           | 3.

In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -392.0
Standard deviation of reward: 0.0
Average successful assignments: 5.0
All assignments history: [12, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -380     |
| time/              |          |
|    fps             | 61       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -314.0
Standard deviation of reward: 0.0
Average successful assignments: 21.416666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -378        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 2           |
|    time_elapsed         | 35          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008110204 |
|    clip_fraction        | 0.0836      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.88       |
|    explained_variance   | 0.0156      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.2         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0378     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 50.56666666666667
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -369       |
| time/                   |            |
|    fps                  | 55         |
|    iterations           | 10         |
|    time_elapsed         | 183        |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.00998069 |
|    clip_fraction        | 0.194      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.82      |
|    explained_variance   | 0.00159    |
|    learning_rate        | 0.00018    |
|    loss                 | 0.658      |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0407    |
|    value_loss           | 3.64       |
----------

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 110.17129629629629
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 331         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009444494 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.73       |
|    explained_variance   | 0.0674      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.576       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0427     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 134.80128205128204
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -350         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 26           |
|    time_elapsed         | 478          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0078507485 |
|    clip_fraction        | 0.128        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.48        |
|    explained_variance   | 0.281        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.918        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0406      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 149.11764705882354
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -330       |
| time/                   |            |
|    fps                  | 55         |
|    iterations           | 34         |
|    time_elapsed         | 628        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.00777787 |
|    clip_fraction        | 0.138      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.92      |
|    explained_variance   | 0.429      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.19       |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.043     |
|    value_loss           | 3.35       |
---------

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 158.7420634920635
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -303       |
| time/                   |            |
|    fps                  | 55         |
|    iterations           | 42         |
|    time_elapsed         | 777        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00671835 |
|    clip_fraction        | 0.11       |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.23      |
|    explained_variance   | 0.538      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.24       |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0394    |
|    value_loss           | 3.11       |
----------

-------- Rollout Summary --------
Total mean reward: 90.0
Standard deviation of reward: 0.0
Average successful assignments: 168.50833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -276        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 50          |
|    time_elapsed         | 914         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007874241 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.93       |
|    explained_variance   | 0.496       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.839       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0379     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 106.0
Standard deviation of reward: 0.0
Average successful assignments: 177.75574712643677
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -233         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 58           |
|    time_elapsed         | 1063         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0065588695 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.64        |
|    explained_variance   | 0.366        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.16         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0406      |
|    value_

-------- Rollout Summary --------
Total mean reward: 116.0
Standard deviation of reward: 0.0
Average successful assignments: 185.41666666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -188         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 66           |
|    time_elapsed         | 1209         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0066670645 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.38        |
|    explained_variance   | 0.36         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.05         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.034       |
|    value_

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 191.86148648648648
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -142         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 74           |
|    time_elapsed         | 1360         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0062728487 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.29        |
|    explained_variance   | 0.409        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.42         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0338      |
|    value_

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 197.6910569105691
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -103         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 82           |
|    time_elapsed         | 1508         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0064549977 |
|    clip_fraction        | 0.112        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.21        |
|    explained_variance   | 0.456        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.999        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0332      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 202.73611111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -73.1       |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 90          |
|    time_elapsed         | 1659        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005363147 |
|    clip_fraction        | 0.0824      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.07       |
|    explained_variance   | 0.455       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.58        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0286     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 207.22534013605443
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -52.6       |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 98          |
|    time_elapsed         | 1797        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006426589 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.11       |
|    explained_variance   | 0.57        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0378     |
|    value_loss           | 2.

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -388.0
Standard deviation of reward: 0.0
Average successful assignments: 7.333333333333333
All assignments history: [18, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 61       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -316.0
Standard deviation of reward: 0.0
Average successful assignments: 22.333333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 62          |
|    iterations           | 2           |
|    time_elapsed         | 32          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008061668 |
|    clip_fraction        | 0.091       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.88       |
|    explained_variance   | -0.112      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.82        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0416     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -116.0
Standard deviation of reward: 0.0
Average successful assignments: 96.54166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 10          |
|    time_elapsed         | 175         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010368078 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.81       |
|    explained_variance   | 0.00458     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.655       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0427     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 128.42592592592592
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 18          |
|    time_elapsed         | 320         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011289533 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.75       |
|    explained_variance   | 0.201       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.048      |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 148.61858974358975
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -353       |
| time/                   |            |
|    fps                  | 57         |
|    iterations           | 26         |
|    time_elapsed         | 466        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00796575 |
|    clip_fraction        | 0.132      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.58      |
|    explained_variance   | 0.289      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.295      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.043     |
|    value_loss           | 2.95       |
---------

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 160.06127450980392
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -334         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 34           |
|    time_elapsed         | 612          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0077776397 |
|    clip_fraction        | 0.133        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.14        |
|    explained_variance   | 0.286        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.609        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0445      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 168.61309523809524
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -311       |
| time/                   |            |
|    fps                  | 56         |
|    iterations           | 42         |
|    time_elapsed         | 757        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00847897 |
|    clip_fraction        | 0.167      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.47      |
|    explained_variance   | 0.298      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.994      |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0493    |
|    value_loss           | 2.88       |
---------

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 177.86
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -285        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 50          |
|    time_elapsed         | 900         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007033619 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.02       |
|    explained_variance   | 0.29        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.49        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0431     |
|    value_loss           | 3.29        |


-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 187.2255747126437
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -243         |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 58           |
|    time_elapsed         | 1034         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0054948423 |
|    clip_fraction        | 0.0939       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.66        |
|    explained_variance   | 0.243        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.25         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0354      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 195.03787878787878
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -196         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 66           |
|    time_elapsed         | 1164         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0063147615 |
|    clip_fraction        | 0.113        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.33        |
|    explained_variance   | 0.375        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.5          |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0338      |
|    value_

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 201.79054054054055
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -147         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 74           |
|    time_elapsed         | 1292         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0061397823 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.15        |
|    explained_variance   | 0.472        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.47         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0343      |
|    value_

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 207.5518292682927
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -101         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 82           |
|    time_elapsed         | 1421         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0064639854 |
|    clip_fraction        | 0.101        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.96        |
|    explained_variance   | 0.55         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.031       |
|    value_l

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 212.64444444444445
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -64.4       |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 90          |
|    time_elapsed         | 1545        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005390888 |
|    clip_fraction        | 0.0965      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.86       |
|    explained_variance   | 0.549       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0289     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 216.9421768707483
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -35.3        |
| time/                   |              |
|    fps                  | 60           |
|    iterations           | 98           |
|    time_elapsed         | 1658         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0060909614 |
|    clip_fraction        | 0.0973       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.77        |
|    explained_variance   | 0.589        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.26         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0294      |
|    value_l

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -388.0
Standard deviation of reward: 0.0
Average successful assignments: 7.666666666666667
All assignments history: [18, 14, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -368     |
| time/              |          |
|    fps             | 84       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -364.0
Standard deviation of reward: 0.0
Average successful assignments: 12.333333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 2           |
|    time_elapsed         | 25          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008186003 |
|    clip_fraction        | 0.0914      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.88       |
|    explained_variance   | -0.0287     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.7         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0406     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 106.69166666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -365         |
| time/                   |              |
|    fps                  | 75           |
|    iterations           | 10           |
|    time_elapsed         | 135          |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0094745755 |
|    clip_fraction        | 0.184        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.81        |
|    explained_variance   | 0.00329      |
|    learning_rate        | 0.00018      |
|    loss                 | 0.675        |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.0407      |
|    value_

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 134.4814814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 18          |
|    time_elapsed         | 246         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009671427 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.74       |
|    explained_variance   | 0.15        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.502       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0476     |
|    value_loss           | 2.98

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 151.35576923076923
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -346        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 26          |
|    time_elapsed         | 360         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008848988 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.46       |
|    explained_variance   | 0.276       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.33        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0453     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 163.1299019607843
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -327        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 34          |
|    time_elapsed         | 471         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008513044 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.95       |
|    explained_variance   | 0.242       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.35        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.048      |
|    value_loss           | 3.3 

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 172.73214285714286
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -302         |
| time/                   |              |
|    fps                  | 73           |
|    iterations           | 42           |
|    time_elapsed         | 585          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0074879536 |
|    clip_fraction        | 0.121        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.39        |
|    explained_variance   | 0.249        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.04         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0421      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 181.96166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -276        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 50          |
|    time_elapsed         | 696         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008623855 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.02       |
|    explained_variance   | 0.224       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0431     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 190.9712643678161
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -234         |
| time/                   |              |
|    fps                  | 71           |
|    iterations           | 58           |
|    time_elapsed         | 830          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0066449977 |
|    clip_fraction        | 0.0961       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.74        |
|    explained_variance   | 0.334        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.59         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0369      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 198.25631313131314
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -189         |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 66           |
|    time_elapsed         | 937          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0062225256 |
|    clip_fraction        | 0.121        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.43        |
|    explained_variance   | 0.428        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.43         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.036       |
|    value_

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 204.08333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -146         |
| time/                   |              |
|    fps                  | 71           |
|    iterations           | 74           |
|    time_elapsed         | 1056         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0065692402 |
|    clip_fraction        | 0.112        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.31        |
|    explained_variance   | 0.554        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.47         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0339      |
|    value_

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 209.0630081300813
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -107         |
| time/                   |              |
|    fps                  | 71           |
|    iterations           | 82           |
|    time_elapsed         | 1168         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0055261776 |
|    clip_fraction        | 0.0941       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.26        |
|    explained_variance   | 0.533        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.52         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0283      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 213.2324074074074
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -78.1       |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 90          |
|    time_elapsed         | 1271        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005624253 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.07       |
|    explained_variance   | 0.596       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.99        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0329     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 216.67176870748298
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -58.1        |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 98           |
|    time_elapsed         | 1376         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0049558426 |
|    clip_fraction        | 0.0818       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.02        |
|    explained_variance   | 0.662        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.39         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0304      |
|    value_

In [15]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 9.25
All assignments history: [13, 18, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -369     |
| time/              |          |
|    fps             | 90       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -304.0
Standard deviation of reward: 0.0
Average successful assignments: 26.166666666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -366         |
| time/                   |              |
|    fps                  | 83           |
|    iterations           | 2            |
|    time_elapsed         | 24           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0072508883 |
|    clip_fraction        | 0.0668       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.88        |
|    explained_variance   | 0.157        |
|    learning_rate        | 0.00018      |
|    loss                 | 3.31         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0394      |
|    value

-------- Rollout Summary --------
Total mean reward: -98.0
Standard deviation of reward: 0.0
Average successful assignments: 88.64166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 10          |
|    time_elapsed         | 125         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010260159 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.82       |
|    explained_variance   | 0.0133      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.519       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0405     |
|    value_loss           | 4.1

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 128.02314814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 18          |
|    time_elapsed         | 234         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010866793 |
|    clip_fraction        | 0.248       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.71       |
|    explained_variance   | 0.186       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.729       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0492     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 147.15064102564102
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -348        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 26          |
|    time_elapsed         | 336         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009910397 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.52       |
|    explained_variance   | 0.347       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.904       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0462     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 159.7058823529412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -333        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 34          |
|    time_elapsed         | 437         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008971657 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.11       |
|    explained_variance   | 0.343       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0487     |
|    value_loss           | 3.16

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 169.140873015873
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -311        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 42          |
|    time_elapsed         | 538         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009405053 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.56       |
|    explained_variance   | 0.564       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.45        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0487     |
|    value_loss           | 3.62 

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 177.01333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -286        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 50          |
|    time_elapsed         | 635         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006257517 |
|    clip_fraction        | 0.0969      |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.07       |
|    explained_variance   | 0.541       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.44        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0367     |
|    value_loss           | 3.5

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 184.1221264367816
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -246        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 58          |
|    time_elapsed         | 724         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006987291 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.82       |
|    explained_variance   | 0.405       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0408     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 106.0
Standard deviation of reward: 0.0
Average successful assignments: 190.28535353535352
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -204         |
| time/                   |              |
|    fps                  | 84           |
|    iterations           | 66           |
|    time_elapsed         | 800          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0060800826 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.57        |
|    explained_variance   | 0.389        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.14         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.037       |
|    value_

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 195.58445945945945
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -162        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 74          |
|    time_elapsed         | 853         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.007193667 |
|    clip_fraction        | 0.116       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.41       |
|    explained_variance   | 0.461       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.5         |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0343     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 199.98373983739836
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -123        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 82          |
|    time_elapsed         | 899         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006824901 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.36       |
|    explained_variance   | 0.505       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.99        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0321     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: 114.0
Standard deviation of reward: 0.0
Average successful assignments: 203.75833333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -93.4        |
| time/                   |              |
|    fps                  | 98           |
|    iterations           | 90           |
|    time_elapsed         | 935          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0057997582 |
|    clip_fraction        | 0.114        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.43        |
|    explained_variance   | 0.543        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.56         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0314      |
|    value_

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 206.92006802721087
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -75.3       |
| time/                   |             |
|    fps                  | 103         |
|    iterations           | 98          |
|    time_elapsed         | 969         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005399489 |
|    clip_fraction        | 0.0965      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.3        |
|    explained_variance   | 0.571       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.35        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0335     |
|    value_loss           | 3.

In [16]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -376.0
Standard deviation of reward: 0.0
Average successful assignments: 12.333333333333334
All assignments history: [12, 16, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 268      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -366.0
Standard deviation of reward: 0.0
Average successful assignments: 14.541666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 256         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008034301 |
|    clip_fraction        | 0.0855      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.88       |
|    explained_variance   | -0.121      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.16        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0421     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 105.65
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 241         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010805573 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.81       |
|    explained_variance   | -0.000117   |
|    learning_rate        | 0.00018     |
|    loss                 | 1.87        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0451     |
|    value_loss           | 3.77        |
-

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 142.61574074074073
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -353        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 18          |
|    time_elapsed         | 77          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010314517 |
|    clip_fraction        | 0.18        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.134       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0414     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 159.75
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -341        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 26          |
|    time_elapsed         | 111         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008929601 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.39       |
|    explained_variance   | 0.354       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.439       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0465     |
|    value_loss           | 2.76        |
-

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 171.42156862745097
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -318         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 34           |
|    time_elapsed         | 146          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0076205092 |
|    clip_fraction        | 0.14         |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.85        |
|    explained_variance   | 0.509        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.17         |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0437      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 180.42857142857142
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -292         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 42           |
|    time_elapsed         | 180          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0070308815 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.25        |
|    explained_variance   | 0.493        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.23         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0441      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 188.58666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -265        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 50          |
|    time_elapsed         | 215         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007151505 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.83       |
|    explained_variance   | 0.361       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.838       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.041      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 196.08477011494253
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -222         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 58           |
|    time_elapsed         | 249          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0067530433 |
|    clip_fraction        | 0.129        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.72        |
|    explained_variance   | 0.411        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.2          |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0396      |
|    value_

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 202.01641414141415
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -178         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 66           |
|    time_elapsed         | 284          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0066296943 |
|    clip_fraction        | 0.101        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.53        |
|    explained_variance   | 0.355        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.47         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0335      |
|    value_

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 207.06306306306305
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -134         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 74           |
|    time_elapsed         | 318          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0067480137 |
|    clip_fraction        | 0.109        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.32        |
|    explained_variance   | 0.385        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.52         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0355      |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 211.5680894308943
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -96.7       |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 82          |
|    time_elapsed         | 352         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006089556 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.15       |
|    explained_variance   | 0.471       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.76        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0337     |
|    value_loss           | 4.0

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 215.55462962962963
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -68.2        |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 90           |
|    time_elapsed         | 386          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0056436043 |
|    clip_fraction        | 0.105        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.08        |
|    explained_variance   | 0.461        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.66         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0347      |
|    value_

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 219.02295918367346
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -47.8        |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 98           |
|    time_elapsed         | 421          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0067636333 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.13        |
|    explained_variance   | 0.559        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.48         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0317      |
|    value_

In [17]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -392.0
Standard deviation of reward: 0.0
Average successful assignments: 5.5
All assignments history: [16, 10, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -374     |
| time/              |          |
|    fps             | 270      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -252.0
Standard deviation of reward: 0.0
Average successful assignments: 34.625
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 258         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008103466 |
|    clip_fraction        | 0.0863      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.88       |
|    explained_variance   | -0.256      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.1         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.041      |
|    value_loss           | 17.5        |

-------- Rollout Summary --------
Total mean reward: -36.0
Standard deviation of reward: 0.0
Average successful assignments: 90.11666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 242         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009897721 |
|    clip_fraction        | 0.183       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.82       |
|    explained_variance   | 0.00651     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0409     |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 120.75462962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -363        |
| time/                   |             |
|    fps                  | 240         |
|    iterations           | 18          |
|    time_elapsed         | 76          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009406296 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.75       |
|    explained_variance   | 0.108       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.043      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 136.0096153846154
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -353        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 26          |
|    time_elapsed         | 111         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007082748 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.58       |
|    explained_variance   | 0.247       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.481       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0366     |
|    value_loss           | 3.19

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 147.0808823529412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -339        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 34          |
|    time_elapsed         | 145         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008132404 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.2        |
|    explained_variance   | 0.414       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.39        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0414     |
|    value_loss           | 2.89

-------- Rollout Summary --------
Total mean reward: 98.0
Standard deviation of reward: 0.0
Average successful assignments: 159.74404761904762
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -315        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 42          |
|    time_elapsed         | 179         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008475111 |
|    clip_fraction        | 0.136       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.51       |
|    explained_variance   | 0.455       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.69        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0443     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 104.0
Standard deviation of reward: 0.0
Average successful assignments: 171.26833333333335
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -288        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 50          |
|    time_elapsed         | 213         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008549869 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.92       |
|    explained_variance   | 0.312       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.48        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.05       |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 116.0
Standard deviation of reward: 0.0
Average successful assignments: 180.6278735632184
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -243         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 58           |
|    time_elapsed         | 248          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0074028918 |
|    clip_fraction        | 0.134        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.58        |
|    explained_variance   | 0.444        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0418      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 188.15530303030303
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -195         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 66           |
|    time_elapsed         | 282          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0071262233 |
|    clip_fraction        | 0.126        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.41        |
|    explained_variance   | 0.278        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.3          |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0335      |
|    value_

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 194.0777027027027
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -149         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 74           |
|    time_elapsed         | 317          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0077324165 |
|    clip_fraction        | 0.13         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.33        |
|    explained_variance   | 0.304        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.58         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0395      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 198.97764227642276
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -109         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 82           |
|    time_elapsed         | 352          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0062232553 |
|    clip_fraction        | 0.103        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.35        |
|    explained_variance   | 0.539        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.29         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.035       |
|    value_

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 203.08703703703705
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -78.2        |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 90           |
|    time_elapsed         | 388          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0057815984 |
|    clip_fraction        | 0.118        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.16        |
|    explained_variance   | 0.609        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.78         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0364      |
|    value_

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 206.71173469387756
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -58.5        |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 98           |
|    time_elapsed         | 424          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0060798796 |
|    clip_fraction        | 0.114        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.25        |
|    explained_variance   | 0.668        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.88         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0375      |
|    value_

In [18]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -346.0
Standard deviation of reward: 0.0
Average successful assignments: 24.916666666666668
All assignments history: [15, 14, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -371     |
| time/              |          |
|    fps             | 263      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -332.0
Standard deviation of reward: 0.0
Average successful assignments: 27.791666666666668
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -372         |
| time/                   |              |
|    fps                  | 246          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0072868476 |
|    clip_fraction        | 0.0659       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.88        |
|    explained_variance   | -0.116       |
|    learning_rate        | 0.00018      |
|    loss                 | 3.62         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0371      |
|    value

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 101.225
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 10          |
|    time_elapsed         | 43          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009186739 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.82       |
|    explained_variance   | 0.00308     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.922       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0373     |
|    value_loss           | 4.38        |


-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 135.8564814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 18          |
|    time_elapsed         | 78          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009481924 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.75       |
|    explained_variance   | 0.123       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.17        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0434     |
|    value_loss           | 2.95

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 151.97756410256412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -352        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 26          |
|    time_elapsed         | 113         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009516819 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.52       |
|    explained_variance   | 0.292       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.66        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0434     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 162.62745098039215
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -334        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 34          |
|    time_elapsed         | 148         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007784465 |
|    clip_fraction        | 0.116       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.99       |
|    explained_variance   | 0.423       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.01        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0376     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 171.66468253968253
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -309        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 42          |
|    time_elapsed         | 184         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008031162 |
|    clip_fraction        | 0.141       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.43       |
|    explained_variance   | 0.521       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.784       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0399     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 179.255
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -284         |
| time/                   |              |
|    fps                  | 233          |
|    iterations           | 50           |
|    time_elapsed         | 219          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0082113445 |
|    clip_fraction        | 0.145        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.08        |
|    explained_variance   | 0.494        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.987        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0446      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 185.2772988505747
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -243        |
| time/                   |             |
|    fps                  | 232         |
|    iterations           | 58          |
|    time_elapsed         | 255         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007956551 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.73       |
|    explained_variance   | 0.372       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.46        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0468     |
|    value_loss           | 3.56

-------- Rollout Summary --------
Total mean reward: 90.0
Standard deviation of reward: 0.0
Average successful assignments: 190.59469696969697
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -198        |
| time/                   |             |
|    fps                  | 232         |
|    iterations           | 66          |
|    time_elapsed         | 290         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007842584 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.52       |
|    explained_variance   | 0.222       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.33        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0416     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 108.0
Standard deviation of reward: 0.0
Average successful assignments: 195.53265765765767
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -156        |
| time/                   |             |
|    fps                  | 232         |
|    iterations           | 74          |
|    time_elapsed         | 325         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006802239 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.45       |
|    explained_variance   | 0.324       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.27        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0378     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 199.8709349593496
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -117         |
| time/                   |              |
|    fps                  | 232          |
|    iterations           | 82           |
|    time_elapsed         | 361          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0058738776 |
|    clip_fraction        | 0.1          |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.28        |
|    explained_variance   | 0.499        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.79         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.035       |
|    value_l

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 203.625
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -89.3        |
| time/                   |              |
|    fps                  | 234          |
|    iterations           | 90           |
|    time_elapsed         | 393          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0054015038 |
|    clip_fraction        | 0.0969       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.15        |
|    explained_variance   | 0.571        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.63         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0336      |
|    value_loss       

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 206.9387755102041
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -69.2       |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 98          |
|    time_elapsed         | 425         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006803367 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.12       |
|    explained_variance   | 0.61        |
|    learning_rate        | 0.00018     |
|    loss                 | 1           |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0356     |
|    value_loss           | 3.0