In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def optimize_knn_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    knn_model = KNeighborsRegressor()
    param_grid = {
        'n_neighbors': [3, 5, 10, 20, 30, 40],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'p': [1, 2]  # 1: Manhattan distance, 2: Euclidean distance
    }
    grid_search = GridSearchCV(knn_model, param_grid, cv=5, verbose=1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    print("Best parameters:", grid_search.best_params_)
    best_knn = grid_search.best_estimator_
    return best_knn, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.1.csv')
knn_model, scaler = optimize_knn_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.1.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = knn_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
MAE: 1.9645398393113325
RMSE: 2.4344408047900314
R-squared: 0.9599156505668087
RAE: 0.20517700777318204


In [4]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 4.8
All assignments history: [3, 5, 3, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -191     |
| time/              |          |
|    fps             | 465      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -166.0
Standard deviation of reward: 0.0
Average successful assignments: 9.366666666666667
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -188         |
| time/                   |              |
|    fps                  | 436          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0076239156 |
|    clip_fraction        | 0.0636       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.57        |
|    explained_variance   | -0.0697      |
|    learning_rate        | 0.00018      |
|    loss                 | 3.32         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.043       |
|    value_

-------- Rollout Summary --------
Total mean reward: -24.0
Standard deviation of reward: 0.0
Average successful assignments: 36.153333333333336
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 379         |
|    iterations           | 10          |
|    time_elapsed         | 26          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011508979 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.0523      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.69        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0441     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 48.94074074074074
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 348         |
|    iterations           | 18          |
|    time_elapsed         | 52          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012622284 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.52       |
|    explained_variance   | 0.485       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0519     |
|    value_loss           | 4.2

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 56.1
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 311         |
|    iterations           | 26          |
|    time_elapsed         | 85          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009891365 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.48       |
|    explained_variance   | 0.585       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.44        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0511     |
|    value_loss           | 4.19        |
----

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 60.384313725490195
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 266         |
|    iterations           | 34          |
|    time_elapsed         | 130         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010565063 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.41       |
|    explained_variance   | 0.659       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0533     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 64.36984126984127
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 248         |
|    iterations           | 42          |
|    time_elapsed         | 173         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009459823 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.29       |
|    explained_variance   | 0.678       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 3.31

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 67.63333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -164        |
| time/                   |             |
|    fps                  | 205         |
|    iterations           | 50          |
|    time_elapsed         | 248         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009468744 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.05       |
|    explained_variance   | 0.632       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.792       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0548     |
|    value_loss           | 3.36

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 70.99195402298851
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -149       |
| time/                   |            |
|    fps                  | 171        |
|    iterations           | 58         |
|    time_elapsed         | 346        |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.00938366 |
|    clip_fraction        | 0.183      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.71      |
|    explained_variance   | 0.615      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.23       |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.0552    |
|    value_loss           | 2.97       |
----------

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 74.25656565656566
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -129        |
| time/                   |             |
|    fps                  | 151         |
|    iterations           | 66          |
|    time_elapsed         | 447         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008309676 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.27       |
|    explained_variance   | 0.595       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.679       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0526     |
|    value_loss           | 2.62

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 77.40720720720721
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -108        |
| time/                   |             |
|    fps                  | 137         |
|    iterations           | 74          |
|    time_elapsed         | 549         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009773578 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.97       |
|    explained_variance   | 0.566       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.827       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0556     |
|    value_loss           | 2.49

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 80.63170731707316
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -87.7       |
| time/                   |             |
|    fps                  | 128         |
|    iterations           | 82          |
|    time_elapsed         | 654         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009869643 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.66       |
|    explained_variance   | 0.476       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.855       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0565     |
|    value_loss           | 2.35

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 83.61333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -66.9       |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 90          |
|    time_elapsed         | 758         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008771263 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.32       |
|    explained_variance   | 0.499       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.75        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0519     |
|    value_loss           | 2.56

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 86.5
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -47.2       |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 98          |
|    time_elapsed         | 859         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.007819602 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4          |
|    explained_variance   | 0.433       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.57        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0493     |
|    value_loss           | 2.18        |
---

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 5.066666666666666
All assignments history: [8, 8, 9, 4, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 81       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -134.0
Standard deviation of reward: 0.0
Average successful assignments: 14.433333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008262696 |
|    clip_fraction        | 0.0729      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.112      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.31        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0459     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -20.0
Standard deviation of reward: 0.0
Average successful assignments: 34.38666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 10          |
|    time_elapsed         | 130         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011680447 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.54       |
|    explained_variance   | 0.0927      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.88        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0456     |
|    value_loss           | 5.4

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 43.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 18          |
|    time_elapsed         | 232         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011759293 |
|    clip_fraction        | 0.228       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | 0.39        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.049      |
|    value_loss           | 4.54        |
--

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 50.52820512820513
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 26          |
|    time_elapsed         | 339         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007712417 |
|    clip_fraction        | 0.125       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | 0.601       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.462       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0445     |
|    value_loss           | 3.91 

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 56.23137254901961
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 34          |
|    time_elapsed         | 455         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009156654 |
|    clip_fraction        | 0.143       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.41       |
|    explained_variance   | 0.69        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.47        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0479     |
|    value_loss           | 3.67

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 61.717460317460315
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -171        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 42          |
|    time_elapsed         | 568         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008793083 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.26       |
|    explained_variance   | 0.685       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.709       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 66.25866666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -158        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 50          |
|    time_elapsed         | 683         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.011296911 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.95       |
|    explained_variance   | 0.66        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.19        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0589     |
|    value_loss           | 2.93

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 70.38735632183908
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -138        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 58          |
|    time_elapsed         | 796         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010468751 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.46       |
|    explained_variance   | 0.662       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.866       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0572     |
|    value_loss           | 3.01

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 74.06565656565657
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -113        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 66          |
|    time_elapsed         | 908         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009321711 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.05       |
|    explained_variance   | 0.6         |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0538     |
|    value_loss           | 3.01

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 77.48108108108109
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -91.6      |
| time/                   |            |
|    fps                  | 74         |
|    iterations           | 74         |
|    time_elapsed         | 1021       |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.00922442 |
|    clip_fraction        | 0.199      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.79      |
|    explained_variance   | 0.578      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.836      |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.0543    |
|    value_loss           | 2.32       |
----------

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 80.59024390243903
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -75.4       |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 82          |
|    time_elapsed         | 1135        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009333612 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.53       |
|    explained_variance   | 0.58        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.575       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0504     |
|    value_loss           | 1.95

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 83.57407407407408
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -60.7       |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 90          |
|    time_elapsed         | 1236        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009109741 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.32       |
|    explained_variance   | 0.539       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.938       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0519     |
|    value_loss           | 2.44

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 86.2578231292517
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -46.8        |
| time/                   |              |
|    fps                  | 74           |
|    iterations           | 98           |
|    time_elapsed         | 1342         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0088328235 |
|    clip_fraction        | 0.189        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.12        |
|    explained_variance   | 0.589        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.722        |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0503      |
|    value_los

In [6]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -194.0
Standard deviation of reward: 0.0
Average successful assignments: 4.2
All assignments history: [7, 10, 4, 7, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 86       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 10.366666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 2           |
|    time_elapsed         | 25          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007697515 |
|    clip_fraction        | 0.0613      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.153      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.04        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.045      |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -64.0
Standard deviation of reward: 0.0
Average successful assignments: 35.08
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 10          |
|    time_elapsed         | 127         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011469303 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.115       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.29        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.045      |
|    value_loss           | 5.32        |
-

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 49.114814814814814
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 18          |
|    time_elapsed         | 228         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011544064 |
|    clip_fraction        | 0.225       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | 0.441       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.755       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0502     |
|    value_loss           | 4.28

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 56.14102564102564
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 26          |
|    time_elapsed         | 331         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009137334 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.649       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.649       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0482     |
|    value_loss           | 3.57 

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 60.20392156862745
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 34          |
|    time_elapsed         | 432         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009621032 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.29       |
|    explained_variance   | 0.758       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.951       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.052      |
|    value_loss           | 2.56 

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 63.319047619047616
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -162        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 42          |
|    time_elapsed         | 532         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009260482 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.95       |
|    explained_variance   | 0.772       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.554       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0544     |
|    value_loss           | 2.65

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 66.28133333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -142        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 50          |
|    time_elapsed         | 628         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010718651 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.45       |
|    explained_variance   | 0.676       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.914       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.058      |
|    value_loss           | 2.49

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 70.26436781609195
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -115        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 58          |
|    time_elapsed         | 730         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009723035 |
|    clip_fraction        | 0.228       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.96       |
|    explained_variance   | 0.66        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.591       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0602     |
|    value_loss           | 2.2 

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 74.44040404040405
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -87.8       |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 66          |
|    time_elapsed         | 830         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010409977 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.63       |
|    explained_variance   | 0.643       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.609       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0546     |
|    value_loss           | 2.13

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 78.58288288288288
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -68.4       |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 74          |
|    time_elapsed         | 929         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009886124 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.49       |
|    explained_variance   | 0.651       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.887       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0562     |
|    value_loss           | 2.28

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 82.59268292682927
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -53.4       |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 82          |
|    time_elapsed         | 1021        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.012977872 |
|    clip_fraction        | 0.267       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.38       |
|    explained_variance   | 0.56        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.617       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0606     |
|    value_loss           | 2.18

-------- Rollout Summary --------
Total mean reward: 86.0
Standard deviation of reward: 0.0
Average successful assignments: 86.08592592592592
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -44.5       |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 90          |
|    time_elapsed         | 1113        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.010739644 |
|    clip_fraction        | 0.267       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.22       |
|    explained_variance   | 0.44        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.575       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0607     |
|    value_loss           | 2.18

-------- Rollout Summary --------
Total mean reward: 90.0
Standard deviation of reward: 0.0
Average successful assignments: 89.32312925170068
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -33.9       |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 98          |
|    time_elapsed         | 1205        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009199503 |
|    clip_fraction        | 0.216       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.02       |
|    explained_variance   | 0.446       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.6         |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0545     |
|    value_loss           | 2.24

In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 4.8
All assignments history: [9, 2, 5, 7, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 97       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -124.0
Standard deviation of reward: 0.0
Average successful assignments: 15.966666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007864229 |
|    clip_fraction        | 0.0656      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.215      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.04        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0453     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -44.0
Standard deviation of reward: 0.0
Average successful assignments: 42.3
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 10          |
|    time_elapsed         | 111         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011451429 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.0937      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.59        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0461     |
|    value_loss           | 5.53        |
--

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 54.022222222222226
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 18          |
|    time_elapsed         | 202         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012202574 |
|    clip_fraction        | 0.243       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | 0.499       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.45        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.052      |
|    value_loss           | 4.2 

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 58.84615384615385
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -181         |
| time/                   |              |
|    fps                  | 91           |
|    iterations           | 26           |
|    time_elapsed         | 292          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0107985865 |
|    clip_fraction        | 0.2          |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.44        |
|    explained_variance   | 0.687        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.92         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.055       |
|    value_los

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 62.188235294117646
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 34          |
|    time_elapsed         | 385         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.011110183 |
|    clip_fraction        | 0.217       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.719       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.04        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0578     |
|    value_loss           | 3.34

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 64.51587301587301
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -170        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 42          |
|    time_elapsed         | 479         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009712901 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.25       |
|    explained_variance   | 0.724       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.497       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0523     |
|    value_loss           | 3.11 

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 67.16933333333333
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -159      |
| time/                   |           |
|    fps                  | 89        |
|    iterations           | 50        |
|    time_elapsed         | 571       |
|    total_timesteps      | 51200     |
| train/                  |           |
|    approx_kl            | 0.0109796 |
|    clip_fraction        | 0.205     |
|    clip_range           | 0.15      |
|    entropy_loss         | -5.96     |
|    explained_variance   | 0.747     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.463     |
|    n_updates            | 490       |
|    policy_gradient_loss | -0.0597   |
|    value_loss           | 2.27      |
------------------------------

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 70.18735632183908
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -144        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 58          |
|    time_elapsed         | 660         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010482818 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.62       |
|    explained_variance   | 0.704       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.877       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0591     |
|    value_loss           | 2.56

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 73.28686868686869
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -122        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 66          |
|    time_elapsed         | 751         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009428076 |
|    clip_fraction        | 0.208       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.23       |
|    explained_variance   | 0.692       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.505       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0585     |
|    value_loss           | 2.17

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 76.38288288288288
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -101        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 74          |
|    time_elapsed         | 842         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009535254 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.84       |
|    explained_variance   | 0.644       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.605       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0535     |
|    value_loss           | 2.21

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 79.73333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -76.9       |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 82          |
|    time_elapsed         | 932         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008421361 |
|    clip_fraction        | 0.159       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.49       |
|    explained_variance   | 0.676       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.992       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0494     |
|    value_loss           | 2.48

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 82.9562962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -56.3       |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 90          |
|    time_elapsed         | 1023        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009300746 |
|    clip_fraction        | 0.193       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.27       |
|    explained_variance   | 0.7         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.544       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0527     |
|    value_loss           | 2.35 

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 85.94421768707483
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -40.3       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 98          |
|    time_elapsed         | 1116        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009027133 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.14       |
|    explained_variance   | 0.62        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.454       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0496     |
|    value_loss           | 1.98

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 5.6
All assignments history: [7, 8, 6, 9, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 97       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -176.0
Standard deviation of reward: 0.0
Average successful assignments: 8.033333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008447057 |
|    clip_fraction        | 0.0783      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.139      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.93        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0476     |
|    value_loss           | 17

-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 28.066666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 10          |
|    time_elapsed         | 113         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011669256 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.103       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0451     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 41.681481481481484
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 18          |
|    time_elapsed         | 204         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011990389 |
|    clip_fraction        | 0.235       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | 0.446       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.83        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0499     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -28.0
Standard deviation of reward: 0.0
Average successful assignments: 47.184615384615384
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 26          |
|    time_elapsed         | 295         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.012615821 |
|    clip_fraction        | 0.239       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.48       |
|    explained_variance   | 0.635       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.404       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0557     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 51.305882352941175
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 34          |
|    time_elapsed         | 385         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010067864 |
|    clip_fraction        | 0.189       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.42       |
|    explained_variance   | 0.737       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.815       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0556     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 55.20634920634921
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 42          |
|    time_elapsed         | 475         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009016123 |
|    clip_fraction        | 0.152       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.28       |
|    explained_variance   | 0.794       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.662       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0525     |
|    value_loss           | 2.43

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 59.43066666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -163        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 50          |
|    time_elapsed         | 564         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007759204 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.05       |
|    explained_variance   | 0.789       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.645       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0487     |
|    value_loss           | 2.53

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 63.39195402298851
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -147        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 58          |
|    time_elapsed         | 659         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007995127 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.65       |
|    explained_variance   | 0.784       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.726       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0493     |
|    value_loss           | 2.21

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 67.11919191919192
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -126        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 66          |
|    time_elapsed         | 757         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010215625 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.2        |
|    explained_variance   | 0.708       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.526       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0588     |
|    value_loss           | 1.91

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 70.75675675675676
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -104        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 74          |
|    time_elapsed         | 851         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009455833 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.87       |
|    explained_variance   | 0.599       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.832       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0522     |
|    value_loss           | 2.3 

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 74.22845528455285
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -84.1       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 82          |
|    time_elapsed         | 948         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008268258 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.55       |
|    explained_variance   | 0.568       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.938       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.05       |
|    value_loss           | 2.38

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 77.48
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -64.6       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 90          |
|    time_elapsed         | 1047        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009169146 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.43       |
|    explained_variance   | 0.507       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.825       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0511     |
|    value_loss           | 2.46        |
--

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 80.3265306122449
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -53.1       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 98          |
|    time_elapsed         | 1146        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.010024156 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.2        |
|    explained_variance   | 0.466       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.622       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0526     |
|    value_loss           | 2.27 

In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 4.333333333333333
All assignments history: [4, 3, 9, 3, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -190     |
| time/              |          |
|    fps             | 93       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -172.0
Standard deviation of reward: 0.0
Average successful assignments: 7.866666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -189        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 2           |
|    time_elapsed         | 23          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007820137 |
|    clip_fraction        | 0.0652      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.256      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.38        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0437     |
|    value_loss           | 16

-------- Rollout Summary --------
Total mean reward: -24.0
Standard deviation of reward: 0.0
Average successful assignments: 26.326666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 10          |
|    time_elapsed         | 119         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011501116 |
|    clip_fraction        | 0.212       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.0627      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.11        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0444     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 40.96296296296296
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -185       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 18         |
|    time_elapsed         | 219        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.01186309 |
|    clip_fraction        | 0.231      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.5       |
|    explained_variance   | 0.575      |
|    learning_rate        | 0.00018    |
|    loss                 | 2.03       |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0519    |
|    value_loss           | 3.68       |
----------

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 49.95128205128205
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 26          |
|    time_elapsed         | 318         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009647829 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.44       |
|    explained_variance   | 0.749       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.718       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0531     |
|    value_loss           | 2.88

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 55.18627450980392
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 34          |
|    time_elapsed         | 418         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010178762 |
|    clip_fraction        | 0.208       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.34       |
|    explained_variance   | 0.837       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.601       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0606     |
|    value_loss           | 2.1  

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 59.233333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -168        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 42          |
|    time_elapsed         | 517         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009978332 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.07       |
|    explained_variance   | 0.852       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.371       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.059      |
|    value_loss           | 1.84

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 62.834666666666664
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -149        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 50          |
|    time_elapsed         | 610         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010869841 |
|    clip_fraction        | 0.226       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.52       |
|    explained_variance   | 0.804       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.656       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0598     |
|    value_loss           | 1.8

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 66.32758620689656
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -122        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 58          |
|    time_elapsed         | 709         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009717556 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.01       |
|    explained_variance   | 0.69        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.724       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.056      |
|    value_loss           | 2.1 

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 69.67676767676768
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -94.4       |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 66          |
|    time_elapsed         | 807         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010298414 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.72       |
|    explained_variance   | 0.636       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.633       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0572     |
|    value_loss           | 1.75

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 73.3018018018018
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -74.9       |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 74          |
|    time_elapsed         | 907         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.010044804 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.58       |
|    explained_variance   | 0.594       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.539       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0558     |
|    value_loss           | 1.77 

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 76.58536585365853
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -62         |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 82          |
|    time_elapsed         | 1007        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009009867 |
|    clip_fraction        | 0.21        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.35       |
|    explained_variance   | 0.564       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.516       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0567     |
|    value_loss           | 2.07

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 79.52814814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -51.2       |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 90          |
|    time_elapsed         | 1106        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008807993 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.18       |
|    explained_variance   | 0.678       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.479       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 2.27

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 82.18163265306123
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -42.1       |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 98          |
|    time_elapsed         | 1206        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009835336 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.21       |
|    explained_variance   | 0.797       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.694       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 2.1 

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 5.933333333333334
All assignments history: [7, 5, 4, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 82       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -100.0
Standard deviation of reward: 0.0
Average successful assignments: 20.433333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -189         |
| time/                   |              |
|    fps                  | 81           |
|    iterations           | 2            |
|    time_elapsed         | 25           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0076278336 |
|    clip_fraction        | 0.0623       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.57        |
|    explained_variance   | -0.372       |
|    learning_rate        | 0.00018      |
|    loss                 | 1.8          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.045       |
|    value

-------- Rollout Summary --------
Total mean reward: -32.0
Standard deviation of reward: 0.0
Average successful assignments: 30.64
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 10          |
|    time_elapsed         | 122         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011630618 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.54       |
|    explained_variance   | 0.109       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0452     |
|    value_loss           | 5.38        |
-

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 41.87777777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 18          |
|    time_elapsed         | 214         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011825193 |
|    clip_fraction        | 0.229       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.52       |
|    explained_variance   | 0.487       |
|    learning_rate        | 0.00018     |
|    loss                 | 3.14        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0544     |
|    value_loss           | 4.4

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 48.748717948717946
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 26          |
|    time_elapsed         | 306         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010008553 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.693       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.111       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 53.47254901960784
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -178         |
| time/                   |              |
|    fps                  | 86           |
|    iterations           | 34           |
|    time_elapsed         | 400          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0090238545 |
|    clip_fraction        | 0.185        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.37        |
|    explained_variance   | 0.76         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.914        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0517      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 57.79047619047619
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 42          |
|    time_elapsed         | 496         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009856423 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.15       |
|    explained_variance   | 0.78        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0571     |
|    value_loss           | 2.52

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 62.31066666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -153        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 593         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010771193 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.744       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.338       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0609     |
|    value_loss           | 2.34

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 66.62528735632183
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -131        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 58          |
|    time_elapsed         | 691         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008710397 |
|    clip_fraction        | 0.189       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.13       |
|    explained_variance   | 0.719       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.86        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0577     |
|    value_loss           | 2.33

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 70.8020202020202
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -105        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 66          |
|    time_elapsed         | 789         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008135242 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.82       |
|    explained_variance   | 0.66        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.733       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.44 

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 74.73243243243243
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -84.2        |
| time/                   |              |
|    fps                  | 85           |
|    iterations           | 74           |
|    time_elapsed         | 887          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0095666945 |
|    clip_fraction        | 0.188        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.6         |
|    explained_variance   | 0.634        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.66         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0548      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 78.16666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -68.3       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 82          |
|    time_elapsed         | 984         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009602181 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.42       |
|    explained_variance   | 0.649       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.966       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0538     |
|    value_loss           | 2.21

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 81.20666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -55.7       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 90          |
|    time_elapsed         | 1082        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009481425 |
|    clip_fraction        | 0.218       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.28       |
|    explained_variance   | 0.569       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.445       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0549     |
|    value_loss           | 2.02

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 84.04965986394558
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -43.9       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 98          |
|    time_elapsed         | 1179        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009811649 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.11       |
|    explained_variance   | 0.528       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.741       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.053      |
|    value_loss           | 2.45

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -200.0
Standard deviation of reward: 0.0
Average successful assignments: 2.1333333333333333
All assignments history: [8, 5, 7, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 99       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -148.0
Standard deviation of reward: 0.0
Average successful assignments: 10.966666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 97          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007977322 |
|    clip_fraction        | 0.0705      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.151      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.09        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0449     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -20.0
Standard deviation of reward: 0.0
Average successful assignments: 31.873333333333335
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 10          |
|    time_elapsed         | 111         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012394566 |
|    clip_fraction        | 0.24        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.104       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.995       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0499     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 48.592592592592595
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 18          |
|    time_elapsed         | 196         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.013143944 |
|    clip_fraction        | 0.258       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | 0.579       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.428       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0547     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 55.246153846153845
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -183         |
| time/                   |              |
|    fps                  | 94           |
|    iterations           | 26           |
|    time_elapsed         | 282          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0111025125 |
|    clip_fraction        | 0.217        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.45        |
|    explained_variance   | 0.747        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.907        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0549      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 59.207843137254905
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 34          |
|    time_elapsed         | 368         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.011188386 |
|    clip_fraction        | 0.219       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.38       |
|    explained_variance   | 0.807       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.207       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0563     |
|    value_loss           | 2.58

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 62.95079365079365
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -170        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 42          |
|    time_elapsed         | 453         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009824291 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.17       |
|    explained_variance   | 0.827       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.736       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0549     |
|    value_loss           | 2.21

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 66.66933333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -154       |
| time/                   |            |
|    fps                  | 94         |
|    iterations           | 50         |
|    time_elapsed         | 539        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.00921167 |
|    clip_fraction        | 0.184      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.66      |
|    explained_variance   | 0.759      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.587      |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0573    |
|    value_loss           | 2.47       |
----------

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 70.52413793103449
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -128        |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 58          |
|    time_elapsed         | 622         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010386162 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.18       |
|    explained_variance   | 0.671       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.834       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0558     |
|    value_loss           | 2.54

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 74.05252525252526
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -101        |
| time/                   |             |
|    fps                  | 96          |
|    iterations           | 66          |
|    time_elapsed         | 700         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009360602 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.84       |
|    explained_variance   | 0.693       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.549       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0542     |
|    value_loss           | 2.05

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 77.34954954954955
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -79.9       |
| time/                   |             |
|    fps                  | 97          |
|    iterations           | 74          |
|    time_elapsed         | 774         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008752334 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.54       |
|    explained_variance   | 0.69        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.486       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0524     |
|    value_loss           | 1.86

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 80.6430894308943
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -62.2       |
| time/                   |             |
|    fps                  | 99          |
|    iterations           | 82          |
|    time_elapsed         | 847         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009424549 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.32       |
|    explained_variance   | 0.67        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.696       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 2.23 

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 83.7237037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -47.7       |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 90          |
|    time_elapsed         | 920         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009991094 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.19       |
|    explained_variance   | 0.769       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.946       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 2.09 

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 86.62653061224489
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -36.8       |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 98          |
|    time_elapsed         | 993         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009869364 |
|    clip_fraction        | 0.223       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.09       |
|    explained_variance   | 0.754       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.798       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0541     |
|    value_loss           | 2.4 

In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 5.0
All assignments history: [6, 6, 9, 6, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 125      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -114.0
Standard deviation of reward: 0.0
Average successful assignments: 17.8
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 2           |
|    time_elapsed         | 17          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008505357 |
|    clip_fraction        | 0.0809      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.0809     |
|    learning_rate        | 0.00018     |
|    loss                 | 3.1         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0469     |
|    value_loss           | 17.6        |
-

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 51.27333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 10          |
|    time_elapsed         | 91          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011398938 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.54       |
|    explained_variance   | 0.0192      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.22        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.045      |
|    value_loss           | 5.97 

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 60.18518518518518
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -186         |
| time/                   |              |
|    fps                  | 110          |
|    iterations           | 18           |
|    time_elapsed         | 166          |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0130678825 |
|    clip_fraction        | 0.254        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.51        |
|    explained_variance   | 0.442        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.84         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0564      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: -30.0
Standard deviation of reward: 0.0
Average successful assignments: 60.743589743589745
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 26          |
|    time_elapsed         | 239         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009422563 |
|    clip_fraction        | 0.156       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | 0.571       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.26        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0496     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -20.0
Standard deviation of reward: 0.0
Average successful assignments: 61.89019607843137
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 34          |
|    time_elapsed         | 314         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008733393 |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.4        |
|    explained_variance   | 0.608       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.541       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.049      |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: -24.0
Standard deviation of reward: 0.0
Average successful assignments: 62.38253968253968
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -171        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 42          |
|    time_elapsed         | 387         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009609189 |
|    clip_fraction        | 0.189       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.21       |
|    explained_variance   | 0.616       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.19        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0541     |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 63.946666666666665
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -158        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 50          |
|    time_elapsed         | 460         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.011949001 |
|    clip_fraction        | 0.231       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.89       |
|    explained_variance   | 0.55        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0565     |
|    value_loss           | 3.46

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 66.53218390804598
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -141         |
| time/                   |              |
|    fps                  | 110          |
|    iterations           | 58           |
|    time_elapsed         | 537          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0112391375 |
|    clip_fraction        | 0.211        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.61        |
|    explained_variance   | 0.532        |
|    learning_rate        | 0.00018      |
|    loss                 | 2.14         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0569      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 69.4090909090909
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -124        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 66          |
|    time_elapsed         | 627         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010694705 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.35       |
|    explained_variance   | 0.576       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0568     |
|    value_loss           | 2.93 

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 72.47747747747748
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -108        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 74          |
|    time_elapsed         | 697         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008882374 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5          |
|    explained_variance   | 0.527       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 2.74

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 75.48130081300813
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -89.2       |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 82          |
|    time_elapsed         | 776         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009699932 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.7        |
|    explained_variance   | 0.531       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.806       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0543     |
|    value_loss           | 2.33

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 78.43777777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -69.6       |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 90          |
|    time_elapsed         | 851         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008502377 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.52       |
|    explained_variance   | 0.562       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.619       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.049      |
|    value_loss           | 2.36

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 81.33469387755102
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -55.8       |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 98          |
|    time_elapsed         | 925         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009987178 |
|    clip_fraction        | 0.193       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.34       |
|    explained_variance   | 0.607       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.466       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0526     |
|    value_loss           | 1.62

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -194.0
Standard deviation of reward: 0.0
Average successful assignments: 4.933333333333334
All assignments history: [7, 10, 9, 11, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -182     |
| time/              |          |
|    fps             | 140      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 10.9
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 132         |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008273913 |
|    clip_fraction        | 0.075       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.193      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.41        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0482     |
|    value_loss           | 16.6        |
-

-------- Rollout Summary --------
Total mean reward: -74.0
Standard deviation of reward: 0.0
Average successful assignments: 28.24
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 10          |
|    time_elapsed         | 82          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011933784 |
|    clip_fraction        | 0.216       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.0915      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0479     |
|    value_loss           | 5.24        |
-

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 44.17777777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 18          |
|    time_elapsed         | 150         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.013897307 |
|    clip_fraction        | 0.273       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | 0.448       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.099       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0541     |
|    value_loss           | 4.27

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 51.9974358974359
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 26          |
|    time_elapsed         | 217         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007833895 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.48       |
|    explained_variance   | 0.68        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.991       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0445     |
|    value_loss           | 3.3   

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 57.61764705882353
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 34          |
|    time_elapsed         | 285         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008081072 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.35       |
|    explained_variance   | 0.717       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0489     |
|    value_loss           | 3.13

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 62.265079365079366
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -168        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 42          |
|    time_elapsed         | 352         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009705057 |
|    clip_fraction        | 0.183       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.13       |
|    explained_variance   | 0.706       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.812       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0567     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 66.396
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -154        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 50          |
|    time_elapsed         | 426         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008957064 |
|    clip_fraction        | 0.154       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.76       |
|    explained_variance   | 0.663       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0514     |
|    value_loss           | 2.64        |
-

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 70.56551724137931
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -134        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 58          |
|    time_elapsed         | 496         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007123756 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.29       |
|    explained_variance   | 0.575       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.45        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0457     |
|    value_loss           | 3.28

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 74.3959595959596
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -111        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 66          |
|    time_elapsed         | 564         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008578861 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.88       |
|    explained_variance   | 0.535       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.763       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0542     |
|    value_loss           | 2.87 

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 78.05225225225226
All assignments history: []
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 200      |
|    ep_rew_mean          | -85.4    |
| time/                   |          |
|    fps                  | 119      |
|    iterations           | 74       |
|    time_elapsed         | 631      |
|    total_timesteps      | 75776    |
| train/                  |          |
|    approx_kl            | 0.008644 |
|    clip_fraction        | 0.175    |
|    clip_range           | 0.15     |
|    entropy_loss         | -4.52    |
|    explained_variance   | 0.512    |
|    learning_rate        | 0.00018  |
|    loss                 | 0.843    |
|    n_updates            | 730      |
|    policy_gradient_loss | -0.0481  |
|    value_loss           | 2.39     |
--------------------------------------
-------- Ro

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 81.81056910569106
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -63         |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 82          |
|    time_elapsed         | 698         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008590812 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.21       |
|    explained_variance   | 0.462       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0509     |
|    value_loss           | 2.75

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 85.35629629629629
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -42.1        |
| time/                   |              |
|    fps                  | 120          |
|    iterations           | 90           |
|    time_elapsed         | 764          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0091548925 |
|    clip_fraction        | 0.169        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4           |
|    explained_variance   | 0.419        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.7          |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0508      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 88.42244897959183
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -27.2       |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 98          |
|    time_elapsed         | 831         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008615371 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.83       |
|    explained_variance   | 0.334       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.789       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0517     |
|    value_loss           | 2.41