In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def optimize_knn_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    knn_model = KNeighborsRegressor()
    param_grid = {
        'n_neighbors': [3, 5, 10, 20, 30, 40],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'p': [1, 2]  # 1: Manhattan distance, 2: Euclidean distance
    }
    grid_search = GridSearchCV(knn_model, param_grid, cv=5, verbose=1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    print("Best parameters:", grid_search.best_params_)
    best_knn = grid_search.best_estimator_
    return best_knn, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.01.csv')
knn_model, scaler = optimize_knn_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.01.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = knn_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
MAE: 1.7258836094021666
RMSE: 2.159748576425794
R-squared: 0.9684512036561819
RAE: 0.18025169439477964


In [4]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -176.0
Standard deviation of reward: 0.0
Average successful assignments: 10.0
All assignments history: [9, 6, 6, 5, 4, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 429      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -172.0
Standard deviation of reward: 0.0
Average successful assignments: 11.033333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 411         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007676767 |
|    clip_fraction        | 0.0679      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.297      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.63        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0433     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 47.72
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 347         |
|    iterations           | 10          |
|    time_elapsed         | 29          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012315046 |
|    clip_fraction        | 0.229       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.54       |
|    explained_variance   | 0.111       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.5         |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0487     |
|    value_loss           | 5.12        |
--

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 57.737037037037034
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 328         |
|    iterations           | 18          |
|    time_elapsed         | 56          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.013147831 |
|    clip_fraction        | 0.256       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | 0.435       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.756       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0529     |
|    value_loss           | 4.5

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 62.34871794871795
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 274         |
|    iterations           | 26          |
|    time_elapsed         | 97          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010240248 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.606       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.29        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 3.82

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 67.71372549019608
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 246         |
|    iterations           | 34          |
|    time_elapsed         | 141         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010271335 |
|    clip_fraction        | 0.18        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.36       |
|    explained_variance   | 0.697       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0527     |
|    value_loss           | 3.22

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 71.57619047619048
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -167        |
| time/                   |             |
|    fps                  | 225         |
|    iterations           | 42          |
|    time_elapsed         | 190         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010809657 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.11       |
|    explained_variance   | 0.675       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.36        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0566     |
|    value_loss           | 2.95

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 75.16133333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -152        |
| time/                   |             |
|    fps                  | 181         |
|    iterations           | 50          |
|    time_elapsed         | 281         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010217531 |
|    clip_fraction        | 0.216       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.65       |
|    explained_variance   | 0.648       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.619       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0586     |
|    value_loss           | 2.68

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 78.57701149425287
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -129        |
| time/                   |             |
|    fps                  | 155         |
|    iterations           | 58          |
|    time_elapsed         | 381         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010108363 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.17       |
|    explained_variance   | 0.509       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0566     |
|    value_loss           | 3.07

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 82.13434343434344
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -101        |
| time/                   |             |
|    fps                  | 139         |
|    iterations           | 66          |
|    time_elapsed         | 483         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010314072 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.74       |
|    explained_variance   | 0.542       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.539       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0588     |
|    value_loss           | 2.14

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 85.45585585585586
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -77.2       |
| time/                   |             |
|    fps                  | 129         |
|    iterations           | 74          |
|    time_elapsed         | 586         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.010318207 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.42       |
|    explained_variance   | 0.398       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0518     |
|    value_loss           | 2.82

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 88.60162601626017
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -54.8       |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 82          |
|    time_elapsed         | 692         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.010632609 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.12       |
|    explained_variance   | 0.45        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.677       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0496     |
|    value_loss           | 2.09

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 91.43407407407408
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -36.4       |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 90          |
|    time_elapsed         | 792         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008107694 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.94       |
|    explained_variance   | 0.387       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.795       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0482     |
|    value_loss           | 2.17

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 94.05714285714286
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -23.6       |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 98          |
|    time_elapsed         | 897         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008561272 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.73       |
|    explained_variance   | 0.397       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.614       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0476     |
|    value_loss           | 1.76

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -194.0
Standard deviation of reward: 0.0
Average successful assignments: 4.2
All assignments history: [10, 8, 4, 4, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 92       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -146.0
Standard deviation of reward: 0.0
Average successful assignments: 12.133333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 2           |
|    time_elapsed         | 24          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008111638 |
|    clip_fraction        | 0.0723      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.259      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.93        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0457     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -74.0
Standard deviation of reward: 0.0
Average successful assignments: 32.44
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 10          |
|    time_elapsed         | 129         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012993993 |
|    clip_fraction        | 0.234       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.54       |
|    explained_variance   | 0.0675      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.37        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0505     |
|    value_loss           | 5.08        |
-

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 43.95185185185185
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 18          |
|    time_elapsed         | 231         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012240371 |
|    clip_fraction        | 0.224       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.52       |
|    explained_variance   | 0.478       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.88        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0509     |
|    value_loss           | 4.11

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 52.99487179487179
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -182         |
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 26           |
|    time_elapsed         | 337          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0083111115 |
|    clip_fraction        | 0.14         |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.46        |
|    explained_variance   | 0.655        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.936        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0475      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 58.59411764705882
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -177       |
| time/                   |            |
|    fps                  | 76         |
|    iterations           | 34         |
|    time_elapsed         | 454        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.00800357 |
|    clip_fraction        | 0.13       |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.35      |
|    explained_variance   | 0.751      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.67       |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0483    |
|    value_loss           | 2.78       |
----------

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 62.768253968253966
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -167       |
| time/                   |            |
|    fps                  | 75         |
|    iterations           | 42         |
|    time_elapsed         | 571        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00990072 |
|    clip_fraction        | 0.188      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.11      |
|    explained_variance   | 0.75       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.709      |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0579    |
|    value_loss           | 2.42       |
---------

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 66.184
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -153        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 50          |
|    time_elapsed         | 684         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009878999 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.71       |
|    explained_variance   | 0.711       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.737       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0544     |
|    value_loss           | 2.4         |
-

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 69.1816091954023
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -134      |
| time/                   |           |
|    fps                  | 74        |
|    iterations           | 58        |
|    time_elapsed         | 795       |
|    total_timesteps      | 59392     |
| train/                  |           |
|    approx_kl            | 0.0094802 |
|    clip_fraction        | 0.169     |
|    clip_range           | 0.15      |
|    entropy_loss         | -5.46     |
|    explained_variance   | 0.71      |
|    learning_rate        | 0.00018   |
|    loss                 | 0.649     |
|    n_updates            | 570       |
|    policy_gradient_loss | -0.0525   |
|    value_loss           | 2.1       |
-------------------------------

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 72.14848484848486
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -114        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 66          |
|    time_elapsed         | 909         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008415948 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.13       |
|    explained_variance   | 0.686       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.576       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.18

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 75.24054054054054
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -96.8       |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 74          |
|    time_elapsed         | 1022        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.010432034 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.91       |
|    explained_variance   | 0.626       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.477       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0551     |
|    value_loss           | 2.19

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 78.47073170731707
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -80.2        |
| time/                   |              |
|    fps                  | 74           |
|    iterations           | 82           |
|    time_elapsed         | 1130         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0091295075 |
|    clip_fraction        | 0.172        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.62        |
|    explained_variance   | 0.632        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.594        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0495      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 81.65925925925926
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -62.7       |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 90          |
|    time_elapsed         | 1233        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008739805 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.32       |
|    explained_variance   | 0.655       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.671       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0539     |
|    value_loss           | 2.45

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 84.62244897959184
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -44.5       |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 98          |
|    time_elapsed         | 1337        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009176886 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.13       |
|    explained_variance   | 0.688       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.815       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0516     |
|    value_loss           | 2.06

In [6]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -196.0
Standard deviation of reward: 0.0
Average successful assignments: 4.066666666666666
All assignments history: [10, 7, 9, 4, 11, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -184     |
| time/              |          |
|    fps             | 90       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -140.0
Standard deviation of reward: 0.0
Average successful assignments: 13.0
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -186      |
| time/                   |           |
|    fps                  | 86        |
|    iterations           | 2         |
|    time_elapsed         | 23        |
|    total_timesteps      | 2048      |
| train/                  |           |
|    approx_kl            | 0.0085703 |
|    clip_fraction        | 0.0859    |
|    clip_range           | 0.15      |
|    entropy_loss         | -6.57     |
|    explained_variance   | -0.175    |
|    learning_rate        | 0.00018   |
|    loss                 | 2.66      |
|    n_updates            | 10        |
|    policy_gradient_loss | -0.0492   |
|    value_loss           | 16.4      |
---------------------------------------
-

-------- Rollout Summary --------
Total mean reward: -40.0
Standard deviation of reward: 0.0
Average successful assignments: 20.886666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 10          |
|    time_elapsed         | 125         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011383587 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.54       |
|    explained_variance   | 0.0773      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.806       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0445     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 41.60740740740741
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 18          |
|    time_elapsed         | 226         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010747898 |
|    clip_fraction        | 0.193       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | 0.491       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.528       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0498     |
|    value_loss           | 4.12 

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 51.65128205128205
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 26          |
|    time_elapsed         | 327         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.011255324 |
|    clip_fraction        | 0.212       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.641       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.973       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0556     |
|    value_loss           | 3.61

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 57.68235294117647
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 34          |
|    time_elapsed         | 426         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007668885 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.4        |
|    explained_variance   | 0.744       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0483     |
|    value_loss           | 3.1 

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 62.285714285714285
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -172         |
| time/                   |              |
|    fps                  | 82           |
|    iterations           | 42           |
|    time_elapsed         | 523          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0082978755 |
|    clip_fraction        | 0.13         |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.23        |
|    explained_variance   | 0.778        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.811        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0501      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 66.06133333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -158        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 50          |
|    time_elapsed         | 623         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009785894 |
|    clip_fraction        | 0.185       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.89       |
|    explained_variance   | 0.776       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.863       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0572     |
|    value_loss           | 2.8 

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 69.37931034482759
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -138         |
| time/                   |              |
|    fps                  | 81           |
|    iterations           | 58           |
|    time_elapsed         | 725          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0106147025 |
|    clip_fraction        | 0.224        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.42        |
|    explained_variance   | 0.75         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.447        |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0608      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 72.64242424242424
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -115        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 66          |
|    time_elapsed         | 824         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009798399 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.05       |
|    explained_variance   | 0.717       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.398       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0594     |
|    value_loss           | 2.12

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 75.57747747747747
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -94.8       |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 74          |
|    time_elapsed         | 920         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009864185 |
|    clip_fraction        | 0.212       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.8        |
|    explained_variance   | 0.688       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.633       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.056      |
|    value_loss           | 2.09

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 78.33252032520325
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -80.5      |
| time/                   |            |
|    fps                  | 82         |
|    iterations           | 82         |
|    time_elapsed         | 1014       |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00911557 |
|    clip_fraction        | 0.183      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.58      |
|    explained_variance   | 0.685      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.782      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0507    |
|    value_loss           | 2.13       |
----------

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 80.69555555555556
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -67.6     |
| time/                   |           |
|    fps                  | 83        |
|    iterations           | 90        |
|    time_elapsed         | 1106      |
|    total_timesteps      | 92160     |
| train/                  |           |
|    approx_kl            | 0.0095846 |
|    clip_fraction        | 0.206     |
|    clip_range           | 0.15      |
|    entropy_loss         | -4.42     |
|    explained_variance   | 0.65      |
|    learning_rate        | 0.00018   |
|    loss                 | 0.602     |
|    n_updates            | 890       |
|    policy_gradient_loss | -0.0534   |
|    value_loss           | 2.02      |
------------------------------

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 83.20952380952382
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -58.5       |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 98          |
|    time_elapsed         | 1197        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.010277225 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.32       |
|    explained_variance   | 0.68        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.425       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0556     |
|    value_loss           | 1.9 

In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 5.933333333333334
All assignments history: [6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 99       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -168.0
Standard deviation of reward: 0.0
Average successful assignments: 9.466666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007435457 |
|    clip_fraction        | 0.062       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.137      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.81        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0437     |
|    value_loss           | 18

-------- Rollout Summary --------
Total mean reward: -104.0
Standard deviation of reward: 0.0
Average successful assignments: 24.986666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 10          |
|    time_elapsed         | 110         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011746468 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.119       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0496     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -42.0
Standard deviation of reward: 0.0
Average successful assignments: 38.05925925925926
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -184         |
| time/                   |              |
|    fps                  | 91           |
|    iterations           | 18           |
|    time_elapsed         | 200          |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0134723205 |
|    clip_fraction        | 0.285        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.51        |
|    explained_variance   | 0.515        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.77         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0584      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 47.0974358974359
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -181         |
| time/                   |              |
|    fps                  | 91           |
|    iterations           | 26           |
|    time_elapsed         | 290          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0096791005 |
|    clip_fraction        | 0.162        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.47        |
|    explained_variance   | 0.675        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.65         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0511      |
|    value_los

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 53.54705882352941
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 34          |
|    time_elapsed         | 386         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008622995 |
|    clip_fraction        | 0.152       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.38       |
|    explained_variance   | 0.72        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.628       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 3.1 

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 59.05873015873016
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -167         |
| time/                   |              |
|    fps                  | 89           |
|    iterations           | 42           |
|    time_elapsed         | 479          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0088862255 |
|    clip_fraction        | 0.169        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.09        |
|    explained_variance   | 0.732        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.875        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0538      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 63.96666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -149        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 50          |
|    time_elapsed         | 571         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009139763 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.67       |
|    explained_variance   | 0.701       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.975       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.054      |
|    value_loss           | 2.6 

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 68.75632183908046
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -126        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 58          |
|    time_elapsed         | 663         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009283233 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.27       |
|    explained_variance   | 0.618       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.921       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0532     |
|    value_loss           | 2.34

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 73.02424242424243
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -104        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 66          |
|    time_elapsed         | 754         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009804483 |
|    clip_fraction        | 0.193       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.97       |
|    explained_variance   | 0.596       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.672       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0527     |
|    value_loss           | 2.4 

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 76.95225225225225
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -84.4       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 74          |
|    time_elapsed         | 844         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009874372 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.66       |
|    explained_variance   | 0.599       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.658       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0504     |
|    value_loss           | 2.19

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 80.86666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -65.2       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 82          |
|    time_elapsed         | 935         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009781081 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.38       |
|    explained_variance   | 0.592       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.649       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0546     |
|    value_loss           | 2.28

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 84.50814814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -46.3       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 90          |
|    time_elapsed         | 1028        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009525068 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.07       |
|    explained_variance   | 0.551       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.436       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0516     |
|    value_loss           | 2.12

-------- Rollout Summary --------
Total mean reward: 86.0
Standard deviation of reward: 0.0
Average successful assignments: 87.83061224489796
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -29.1       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 98          |
|    time_elapsed         | 1121        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008055213 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.87       |
|    explained_variance   | 0.511       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.13        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0451     |
|    value_loss           | 2.58

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 5.6
All assignments history: [4, 4, 4, 5, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -190     |
| time/              |          |
|    fps             | 100      |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -140.0
Standard deviation of reward: 0.0
Average successful assignments: 13.966666666666667
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -188         |
| time/                   |              |
|    fps                  | 94           |
|    iterations           | 2            |
|    time_elapsed         | 21           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0077665006 |
|    clip_fraction        | 0.0669       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.57        |
|    explained_variance   | -0.19        |
|    learning_rate        | 0.00018      |
|    loss                 | 2.58         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.044       |
|    value

-------- Rollout Summary --------
Total mean reward: -26.0
Standard deviation of reward: 0.0
Average successful assignments: 30.92
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 10          |
|    time_elapsed         | 113         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011294245 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.54       |
|    explained_variance   | 0.0546      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.41        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.045      |
|    value_loss           | 5.37        |
-

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 49.25185185185185
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 18          |
|    time_elapsed         | 206         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012928271 |
|    clip_fraction        | 0.238       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | 0.418       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.984       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0511     |
|    value_loss           | 4.3 

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 59.35128205128205
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 26          |
|    time_elapsed         | 299         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010228448 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.584       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.615       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0497     |
|    value_loss           | 4.08

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 66.08823529411765
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 34          |
|    time_elapsed         | 389         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009576775 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.35       |
|    explained_variance   | 0.628       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.868       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0516     |
|    value_loss           | 3.65

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 70.47777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -168        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 42          |
|    time_elapsed         | 479         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.011247784 |
|    clip_fraction        | 0.224       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.18       |
|    explained_variance   | 0.555       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.703       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0576     |
|    value_loss           | 3.71

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 73.892
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -156       |
| time/                   |            |
|    fps                  | 89         |
|    iterations           | 50         |
|    time_elapsed         | 571        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.01129554 |
|    clip_fraction        | 0.228      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.9       |
|    explained_variance   | 0.49       |
|    learning_rate        | 0.00018    |
|    loss                 | 1.45       |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0591    |
|    value_loss           | 3.52       |
---------------------

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 76.8103448275862
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -140        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 58          |
|    time_elapsed         | 670         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010861743 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.54       |
|    explained_variance   | 0.497       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.798       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0566     |
|    value_loss           | 3.28 

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 79.57272727272728
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -124        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 66          |
|    time_elapsed         | 768         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009061403 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.35       |
|    explained_variance   | 0.507       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.44        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0498     |
|    value_loss           | 3.28

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 82.37747747747748
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -105        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 74          |
|    time_elapsed         | 864         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008947348 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.82       |
|    explained_variance   | 0.512       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0524     |
|    value_loss           | 3.25

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 85.13252032520325
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -82.8       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 82          |
|    time_elapsed         | 964         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008968555 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.45       |
|    explained_variance   | 0.489       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0495     |
|    value_loss           | 2.95

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 87.8237037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -59.6       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 90          |
|    time_elapsed         | 1062        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008408615 |
|    clip_fraction        | 0.18        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.12       |
|    explained_variance   | 0.474       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.08        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.05       |
|    value_loss           | 2.65 

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 90.29183673469387
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -41.1        |
| time/                   |              |
|    fps                  | 86           |
|    iterations           | 98           |
|    time_elapsed         | 1161         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0072242864 |
|    clip_fraction        | 0.144        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.05        |
|    explained_variance   | 0.473        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.826        |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0438      |
|    value_lo

In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -182.0
Standard deviation of reward: 0.0
Average successful assignments: 8.133333333333333
All assignments history: [7, 7, 4, 8, 6, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 93       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -166.0
Standard deviation of reward: 0.0
Average successful assignments: 10.7
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 2           |
|    time_elapsed         | 23          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007621209 |
|    clip_fraction        | 0.0582      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.254      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.33        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.045      |
|    value_loss           | 19.4        |
-

-------- Rollout Summary --------
Total mean reward: -130.0
Standard deviation of reward: 0.0
Average successful assignments: 19.966666666666665
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -187       |
| time/                   |            |
|    fps                  | 86         |
|    iterations           | 10         |
|    time_elapsed         | 118        |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01260107 |
|    clip_fraction        | 0.215      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.54      |
|    explained_variance   | 0.159      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.37       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0484    |
|    value_loss           | 4.97       |
-------

-------- Rollout Summary --------
Total mean reward: -44.0
Standard deviation of reward: 0.0
Average successful assignments: 37.94814814814815
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -186         |
| time/                   |              |
|    fps                  | 85           |
|    iterations           | 18           |
|    time_elapsed         | 215          |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0121631725 |
|    clip_fraction        | 0.24         |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.53        |
|    explained_variance   | 0.474        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0531      |
|    value_l

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 45.19230769230769
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 26          |
|    time_elapsed         | 314         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010988176 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | 0.655       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.29        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0527     |
|    value_loss           | 3.69

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 52.254901960784316
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 34          |
|    time_elapsed         | 412         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008385444 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.756       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.3         |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0484     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 56.787301587301585
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 42          |
|    time_elapsed         | 505         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008526421 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.34       |
|    explained_variance   | 0.789       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.688       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0499     |
|    value_loss           | 2.5

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 60.614666666666665
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -167        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 50          |
|    time_elapsed         | 599         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009145106 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.11       |
|    explained_variance   | 0.78        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.1         |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 64.7367816091954
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -149        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 58          |
|    time_elapsed         | 699         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008454662 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.72       |
|    explained_variance   | 0.732       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.653       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0533     |
|    value_loss           | 2.39 

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 68.67676767676768
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -125        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 66          |
|    time_elapsed         | 796         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010192936 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.2        |
|    explained_variance   | 0.658       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.599       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0566     |
|    value_loss           | 2.24

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 72.63153153153154
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -98.7       |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 74          |
|    time_elapsed         | 894         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.011873824 |
|    clip_fraction        | 0.234       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.77       |
|    explained_variance   | 0.613       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.625       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0575     |
|    value_loss           | 2.11

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 76.52520325203253
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -74.7      |
| time/                   |            |
|    fps                  | 84         |
|    iterations           | 82         |
|    time_elapsed         | 993        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01021496 |
|    clip_fraction        | 0.206      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.63      |
|    explained_variance   | 0.612      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.583      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0543    |
|    value_loss           | 2.17       |
----------

-------- Rollout Summary --------
Total mean reward: 88.0
Standard deviation of reward: 0.0
Average successful assignments: 80.2762962962963
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -58.6        |
| time/                   |              |
|    fps                  | 84           |
|    iterations           | 90           |
|    time_elapsed         | 1090         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0101589095 |
|    clip_fraction        | 0.222        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.43        |
|    explained_variance   | 0.682        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.653        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.055       |
|    value_los

-------- Rollout Summary --------
Total mean reward: 96.0
Standard deviation of reward: 0.0
Average successful assignments: 83.93741496598639
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -46.2     |
| time/                   |           |
|    fps                  | 84        |
|    iterations           | 98        |
|    time_elapsed         | 1189      |
|    total_timesteps      | 100352    |
| train/                  |           |
|    approx_kl            | 0.0113027 |
|    clip_fraction        | 0.241     |
|    clip_range           | 0.15      |
|    entropy_loss         | -4.16     |
|    explained_variance   | 0.596     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.728     |
|    n_updates            | 970       |
|    policy_gradient_loss | -0.0551   |
|    value_loss           | 1.99      |
------------------------------

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -198.0
Standard deviation of reward: 0.0
Average successful assignments: 2.8
All assignments history: [5, 8, 10, 6, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 99       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -158.0
Standard deviation of reward: 0.0
Average successful assignments: 9.366666666666667
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -188         |
| time/                   |              |
|    fps                  | 91           |
|    iterations           | 2            |
|    time_elapsed         | 22           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0077402056 |
|    clip_fraction        | 0.0666       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.57        |
|    explained_variance   | -0.172       |
|    learning_rate        | 0.00018      |
|    loss                 | 3.04         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0442      |
|    value_

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 34.17333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 10          |
|    time_elapsed         | 115         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012233054 |
|    clip_fraction        | 0.222       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.0697      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.01        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0471     |
|    value_loss           | 5.2

-------- Rollout Summary --------
Total mean reward: -40.0
Standard deviation of reward: 0.0
Average successful assignments: 46.592592592592595
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 18          |
|    time_elapsed         | 205         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.013531935 |
|    clip_fraction        | 0.253       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.52       |
|    explained_variance   | 0.399       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.1         |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0505     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 51.62820512820513
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 26          |
|    time_elapsed         | 294         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010501258 |
|    clip_fraction        | 0.19        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | 0.659       |
|    learning_rate        | 0.00018     |
|    loss                 | 3.11        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0516     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 56.409803921568624
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 34          |
|    time_elapsed         | 390         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008478798 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.73        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.07        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0454     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 61.34126984126984
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 42          |
|    time_elapsed         | 485         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008201468 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.36       |
|    explained_variance   | 0.815       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.848       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0482     |
|    value_loss           | 2.62

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 65.89466666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 50          |
|    time_elapsed         | 581         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009913322 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.15       |
|    explained_variance   | 0.842       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0565     |
|    value_loss           | 2.25

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 69.72068965517241
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -154        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 58          |
|    time_elapsed         | 677         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010226492 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.83        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.702       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0589     |
|    value_loss           | 2.21

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 73.23131313131313
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -134        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 66          |
|    time_elapsed         | 775         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008568798 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.39       |
|    explained_variance   | 0.819       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.536       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.33

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 76.52342342342342
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -108        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 74          |
|    time_elapsed         | 872         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008157363 |
|    clip_fraction        | 0.159       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.8        |
|    explained_variance   | 0.786       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.533       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0501     |
|    value_loss           | 2.46

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 79.72926829268293
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -79.6       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 82          |
|    time_elapsed         | 968         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.010012834 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.41       |
|    explained_variance   | 0.814       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.867       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0501     |
|    value_loss           | 2.58

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 82.68
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -59.6       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 90          |
|    time_elapsed         | 1065        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.011046261 |
|    clip_fraction        | 0.212       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.31       |
|    explained_variance   | 0.763       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.697       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0541     |
|    value_loss           | 2.41        |
--

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 85.27755102040817
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -46.6       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 98          |
|    time_elapsed         | 1159        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009912971 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.19       |
|    explained_variance   | 0.646       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.996       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0547     |
|    value_loss           | 2.67

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 6.133333333333334
All assignments history: [5, 3, 8, 9, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 104      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -134.0
Standard deviation of reward: 0.0
Average successful assignments: 15.233333333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -187         |
| time/                   |              |
|    fps                  | 96           |
|    iterations           | 2            |
|    time_elapsed         | 21           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0073570786 |
|    clip_fraction        | 0.0538       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.57        |
|    explained_variance   | -0.327       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.7          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0443      |
|    value

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 46.02
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 10          |
|    time_elapsed         | 109         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012155576 |
|    clip_fraction        | 0.242       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.0822      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.935       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0493     |
|    value_loss           | 5.23        |
--

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 60.87407407407407
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 18          |
|    time_elapsed         | 192         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010581568 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | 0.468       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.88        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.049      |
|    value_loss           | 4.2 

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 68.86923076923077
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 96          |
|    iterations           | 26          |
|    time_elapsed         | 277         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009729378 |
|    clip_fraction        | 0.183       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.44       |
|    explained_variance   | 0.649       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.43        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.052      |
|    value_loss           | 3.3 

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 70.93725490196078
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 96          |
|    iterations           | 34          |
|    time_elapsed         | 360         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008501049 |
|    clip_fraction        | 0.151       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.3        |
|    explained_variance   | 0.737       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.535       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0481     |
|    value_loss           | 3.06

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 72.82539682539682
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -165        |
| time/                   |             |
|    fps                  | 96          |
|    iterations           | 42          |
|    time_elapsed         | 445         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009645743 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6          |
|    explained_variance   | 0.74        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.964       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.054      |
|    value_loss           | 2.69

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 75.16933333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -147        |
| time/                   |             |
|    fps                  | 96          |
|    iterations           | 50          |
|    time_elapsed         | 529         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009014297 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.48       |
|    explained_variance   | 0.716       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0539     |
|    value_loss           | 2.74

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 79.39540229885057
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -118        |
| time/                   |             |
|    fps                  | 97          |
|    iterations           | 58          |
|    time_elapsed         | 611         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009780718 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.98       |
|    explained_variance   | 0.636       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.949       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0501     |
|    value_loss           | 2.47

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 83.5929292929293
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -87.5       |
| time/                   |             |
|    fps                  | 98          |
|    iterations           | 66          |
|    time_elapsed         | 687         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009933963 |
|    clip_fraction        | 0.208       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.51       |
|    explained_variance   | 0.538       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.055      |
|    value_loss           | 2.46 

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 87.00990990990991
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -62.9       |
| time/                   |             |
|    fps                  | 99          |
|    iterations           | 74          |
|    time_elapsed         | 760         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008798756 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.26       |
|    explained_variance   | 0.542       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.902       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.053      |
|    value_loss           | 2.77

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 90.22195121951219
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -43.4       |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 82          |
|    time_elapsed         | 833         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008910248 |
|    clip_fraction        | 0.218       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.95       |
|    explained_variance   | 0.524       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.958       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0554     |
|    value_loss           | 2.72

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 93.11259259259259
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -29.7       |
| time/                   |             |
|    fps                  | 101         |
|    iterations           | 90          |
|    time_elapsed         | 906         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.010904227 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.99       |
|    explained_variance   | 0.551       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.865       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0529     |
|    value_loss           | 2.31

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 95.63265306122449
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -20.9       |
| time/                   |             |
|    fps                  | 102         |
|    iterations           | 98          |
|    time_elapsed         | 978         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008737117 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.81       |
|    explained_variance   | 0.45        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0511     |
|    value_loss           | 2.54

In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 5.0
All assignments history: [8, 5, 8, 8, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 123      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -160.0
Standard deviation of reward: 0.0
Average successful assignments: 10.433333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 2           |
|    time_elapsed         | 17          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007666588 |
|    clip_fraction        | 0.0633      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.57       |
|    explained_variance   | -0.134      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.79        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0455     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -56.0
Standard deviation of reward: 0.0
Average successful assignments: 29.893333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 113         |
|    iterations           | 10          |
|    time_elapsed         | 90          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011786274 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.53       |
|    explained_variance   | 0.141       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.71        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0478     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 50.41481481481482
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -185       |
| time/                   |            |
|    fps                  | 111        |
|    iterations           | 18         |
|    time_elapsed         | 165        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.01188707 |
|    clip_fraction        | 0.235      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.5       |
|    explained_variance   | 0.54       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.764      |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0536    |
|    value_loss           | 3.85       |
----------

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 61.05384615384615
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 26          |
|    time_elapsed         | 236         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008360706 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.44       |
|    explained_variance   | 0.738       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.773       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0491     |
|    value_loss           | 3.07

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 67.1156862745098
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -175       |
| time/                   |            |
|    fps                  | 111        |
|    iterations           | 34         |
|    time_elapsed         | 312        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.00802996 |
|    clip_fraction        | 0.122      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.3       |
|    explained_variance   | 0.808      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.414      |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0483    |
|    value_loss           | 2.52       |
-----------

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 71.44285714285714
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -165        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 42          |
|    time_elapsed         | 385         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009543264 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.01       |
|    explained_variance   | 0.831       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.406       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0544     |
|    value_loss           | 2.15

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 75.55866666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -145        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 50          |
|    time_elapsed         | 458         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010339682 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.42       |
|    explained_variance   | 0.724       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.702       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0553     |
|    value_loss           | 2.62

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 79.61149425287357
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -117        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 58          |
|    time_elapsed         | 535         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009218902 |
|    clip_fraction        | 0.183       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.88       |
|    explained_variance   | 0.667       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.785       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0537     |
|    value_loss           | 2.42

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 83.22121212121212
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -88         |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 66          |
|    time_elapsed         | 624         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009845035 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.48       |
|    explained_variance   | 0.461       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.12        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0561     |
|    value_loss           | 2.65

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 86.4927927927928
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -63.7       |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 74          |
|    time_elapsed         | 695         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008657399 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.14       |
|    explained_variance   | 0.374       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0508     |
|    value_loss           | 3.01 

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 89.54878048780488
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -46.2       |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 82          |
|    time_elapsed         | 774         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009144044 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.2        |
|    explained_variance   | 0.411       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.04        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0513     |
|    value_loss           | 2.69

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 92.28518518518518
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -35.7       |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 90          |
|    time_elapsed         | 848         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009611669 |
|    clip_fraction        | 0.219       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.12       |
|    explained_variance   | 0.461       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.736       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0525     |
|    value_loss           | 2.41

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 94.63061224489796
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -30.8      |
| time/                   |            |
|    fps                  | 108        |
|    iterations           | 98         |
|    time_elapsed         | 922        |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.00988285 |
|    clip_fraction        | 0.214      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.13      |
|    explained_variance   | 0.438      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.967      |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0521    |
|    value_loss           | 2.57       |
----------

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 6.2
All assignments history: [5, 8, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 131      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -166.0
Standard deviation of reward: 0.0
Average successful assignments: 9.933333333333334
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -186       |
| time/                   |            |
|    fps                  | 126        |
|    iterations           | 2          |
|    time_elapsed         | 16         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00790639 |
|    clip_fraction        | 0.0681     |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.57      |
|    explained_variance   | -0.307     |
|    learning_rate        | 0.00018    |
|    loss                 | 2.73       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0452    |
|    value_loss           | 17.8       |
--------

-------- Rollout Summary --------
Total mean reward: -108.0
Standard deviation of reward: 0.0
Average successful assignments: 22.686666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 10          |
|    time_elapsed         | 83          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012363927 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.54       |
|    explained_variance   | 0.0554      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.27        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0475     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 43.37407407407407
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 18          |
|    time_elapsed         | 150         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012311546 |
|    clip_fraction        | 0.242       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | 0.499       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0559     |
|    value_loss           | 4.02 

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 50.13333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 26          |
|    time_elapsed         | 218         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009841571 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.669       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.04        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 3.52 

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 56.166666666666664
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 34          |
|    time_elapsed         | 286         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010932742 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.36       |
|    explained_variance   | 0.756       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.173       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0575     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 60.9
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 42          |
|    time_elapsed         | 354         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009868124 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.19       |
|    explained_variance   | 0.756       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.836       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0584     |
|    value_loss           | 2.64        |
---

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 64.26533333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -159        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 50          |
|    time_elapsed         | 427         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.012344133 |
|    clip_fraction        | 0.256       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.95       |
|    explained_variance   | 0.779       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.2         |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0644     |
|    value_loss           | 1.98

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 67.64137931034483
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -142        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 58          |
|    time_elapsed         | 496         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009440011 |
|    clip_fraction        | 0.18        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.52       |
|    explained_variance   | 0.692       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0535     |
|    value_loss           | 2.65

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 70.76666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -121        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 66          |
|    time_elapsed         | 565         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008817221 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.12       |
|    explained_variance   | 0.63        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.514       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0486     |
|    value_loss           | 2.26

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 73.97837837837838
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -98.5       |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 74          |
|    time_elapsed         | 632         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009026088 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.8        |
|    explained_variance   | 0.606       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.733       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0519     |
|    value_loss           | 2.5 

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 77.3089430894309
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -77.2       |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 82          |
|    time_elapsed         | 700         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008281272 |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.51       |
|    explained_variance   | 0.591       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.995       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0462     |
|    value_loss           | 2.69 

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 80.41851851851852
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -59.2        |
| time/                   |              |
|    fps                  | 120          |
|    iterations           | 90           |
|    time_elapsed         | 766          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0100174695 |
|    clip_fraction        | 0.207        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.31        |
|    explained_variance   | 0.525        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.04         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0529      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 83.27482993197279
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -45.4       |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 98          |
|    time_elapsed         | 833         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.010402965 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.21       |
|    explained_variance   | 0.633       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.726       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0524     |
|    value_loss           | 2.53