In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def train_svr_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    svr_model = SVR()
    svr_model.fit(X_train_scaled, y_train)
    return svr_model, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.01.csv')
svr_model, scaler = train_svr_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.01.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = svr_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']  # This would be prior to overwriting with predictions if you run this block again

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


MAE: 0.6709803969580043
RMSE: 1.1802692335254894
R-squared: 0.9905780981775942
RAE: 0.07007735214500166


In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -390.0
Standard deviation of reward: 0.0
Average successful assignments: 6.583333333333333
All assignments history: [11, 18, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -371     |
| time/              |          |
|    fps             | 60       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -302.0
Standard deviation of reward: 0.0
Average successful assignments: 25.208333333333332
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -368       |
| time/                   |            |
|    fps                  | 57         |
|    iterations           | 2          |
|    time_elapsed         | 35         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00707292 |
|    clip_fraction        | 0.0615     |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.84      |
|    explained_variance   | -0.0705    |
|    learning_rate        | 0.00018    |
|    loss                 | 3.16       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0396    |
|    value_loss           | 17.1       |
-------

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 107.675
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 10          |
|    time_elapsed         | 182         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010770481 |
|    clip_fraction        | 0.225       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.00612     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.373       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0442     |
|    value_loss           | 3.64        |


-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 148.71759259259258
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -356        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 334         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009144871 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.67       |
|    explained_variance   | 0.173       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.148       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0391     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 108.0
Standard deviation of reward: 0.0
Average successful assignments: 168.47435897435898
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -343       |
| time/                   |            |
|    fps                  | 54         |
|    iterations           | 26         |
|    time_elapsed         | 488        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00801563 |
|    clip_fraction        | 0.138      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.4       |
|    explained_variance   | 0.429      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.463      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0409    |
|    value_loss           | 2.69       |
--------

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 181.8014705882353
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -321        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 34          |
|    time_elapsed         | 644         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008447626 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.81       |
|    explained_variance   | 0.488       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.943       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0472     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 191.99603174603175
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -293        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 42          |
|    time_elapsed         | 799         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007842006 |
|    clip_fraction        | 0.164       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.12       |
|    explained_variance   | 0.624       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.27        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0454     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 200.45
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -261         |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 50           |
|    time_elapsed         | 954          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0074232463 |
|    clip_fraction        | 0.14         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.58        |
|    explained_variance   | 0.723        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.872        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.042       |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 207.63793103448276
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -213         |
| time/                   |              |
|    fps                  | 52           |
|    iterations           | 58           |
|    time_elapsed         | 1120         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0075114286 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.36        |
|    explained_variance   | 0.662        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.832        |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0381      |
|    value_

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 213.42550505050505
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -161         |
| time/                   |              |
|    fps                  | 52           |
|    iterations           | 66           |
|    time_elapsed         | 1291         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0071799853 |
|    clip_fraction        | 0.157        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.2         |
|    explained_variance   | 0.624        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.12         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0418      |
|    value_

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 218.38288288288288
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -112         |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 74           |
|    time_elapsed         | 1458         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0051760958 |
|    clip_fraction        | 0.091        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.13        |
|    explained_variance   | 0.573        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.24         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0305      |
|    value_

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 222.7357723577236
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -71          |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 82           |
|    time_elapsed         | 1630         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0068676844 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.01        |
|    explained_variance   | 0.545        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.04         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0345      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 226.41203703703704
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -41         |
| time/                   |             |
|    fps                  | 51          |
|    iterations           | 90          |
|    time_elapsed         | 1796        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.006678763 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.01       |
|    explained_variance   | 0.471       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0379     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 229.5187074829932
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -24          |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 98           |
|    time_elapsed         | 1953         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0069877617 |
|    clip_fraction        | 0.141        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.11        |
|    explained_variance   | 0.492        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.62         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0417      |
|    value_l

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -380.0
Standard deviation of reward: 0.0
Average successful assignments: 10.666666666666666
All assignments history: [15, 13, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 58       |
|    iterations      | 1        |
|    time_elapsed    | 17       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -364.0
Standard deviation of reward: 0.0
Average successful assignments: 13.875
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 2           |
|    time_elapsed         | 36          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007574724 |
|    clip_fraction        | 0.0737      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.118      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.1         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0409     |
|    value_loss           | 15.9        |

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 99.08333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 10          |
|    time_elapsed         | 184         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010300366 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.00348     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.65        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0433     |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: -40.0
Standard deviation of reward: 0.0
Average successful assignments: 123.22222222222223
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -356        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 18          |
|    time_elapsed         | 336         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009213578 |
|    clip_fraction        | 0.159       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.67       |
|    explained_variance   | 0.227       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.92        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0408     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 140.41666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -342        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 26          |
|    time_elapsed         | 481         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008368617 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.349       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.77        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0471     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 92.0
Standard deviation of reward: 0.0
Average successful assignments: 157.52696078431373
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -320        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 34          |
|    time_elapsed         | 626         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009282824 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.88       |
|    explained_variance   | 0.429       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.907       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0482     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 171.15079365079364
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -294         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 42           |
|    time_elapsed         | 774          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0071404646 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.34        |
|    explained_variance   | 0.595        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.38         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0414      |
|    value_

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 183.42833333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -266         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 50           |
|    time_elapsed         | 921          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0073637934 |
|    clip_fraction        | 0.14         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.87        |
|    explained_variance   | 0.697        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.27         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0448      |
|    value_

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 193.78304597701148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -217        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 58          |
|    time_elapsed         | 1061        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006428725 |
|    clip_fraction        | 0.123       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.37       |
|    explained_variance   | 0.603       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.08        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0389     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 168.0
Standard deviation of reward: 0.0
Average successful assignments: 202.67929292929293
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -165         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 66           |
|    time_elapsed         | 1199         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0068129674 |
|    clip_fraction        | 0.135        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.21        |
|    explained_variance   | 0.528        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0384      |
|    value_

-------- Rollout Summary --------
Total mean reward: 174.0
Standard deviation of reward: 0.0
Average successful assignments: 210.3096846846847
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -113        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 74          |
|    time_elapsed         | 1336        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006110826 |
|    clip_fraction        | 0.118       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.94       |
|    explained_variance   | 0.485       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.04        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0376     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 184.0
Standard deviation of reward: 0.0
Average successful assignments: 216.85365853658536
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -66.4        |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 82           |
|    time_elapsed         | 1472         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0061337715 |
|    clip_fraction        | 0.141        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.82        |
|    explained_variance   | 0.493        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.04         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0397      |
|    value_

-------- Rollout Summary --------
Total mean reward: 184.0
Standard deviation of reward: 0.0
Average successful assignments: 222.46666666666667
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -28.7     |
| time/                   |           |
|    fps                  | 57        |
|    iterations           | 90        |
|    time_elapsed         | 1612      |
|    total_timesteps      | 92160     |
| train/                  |           |
|    approx_kl            | 0.0057247 |
|    clip_fraction        | 0.112     |
|    clip_range           | 0.15      |
|    entropy_loss         | -2.66     |
|    explained_variance   | 0.419     |
|    learning_rate        | 0.00018   |
|    loss                 | 1.16      |
|    n_updates            | 890       |
|    policy_gradient_loss | -0.0345   |
|    value_loss           | 2.79      |
----------------------------

-------- Rollout Summary --------
Total mean reward: 186.0
Standard deviation of reward: 0.0
Average successful assignments: 227.17261904761904
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -1.1         |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 98           |
|    time_elapsed         | 1752         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0061361436 |
|    clip_fraction        | 0.125        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.78        |
|    explained_variance   | 0.448        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.45         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0362      |
|    value_

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -306.0
Standard deviation of reward: 0.0
Average successful assignments: 41.083333333333336
All assignments history: [18, 5, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -377     |
| time/              |          |
|    fps             | 71       |
|    iterations      | 1        |
|    time_elapsed    | 14       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -362.0
Standard deviation of reward: 0.0
Average successful assignments: 29.625
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 66          |
|    iterations           | 2           |
|    time_elapsed         | 31          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008143319 |
|    clip_fraction        | 0.0876      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.191      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.78        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0403     |
|    value_loss           | 16.9        |

-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 61.125
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 10          |
|    time_elapsed         | 168         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.008898931 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | 0.00865     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.601       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0371     |
|    value_loss           | 4.17        |

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 98.18055555555556
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -362         |
| time/                   |              |
|    fps                  | 61           |
|    iterations           | 18           |
|    time_elapsed         | 301          |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0107940715 |
|    clip_fraction        | 0.216        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.7         |
|    explained_variance   | 0.0868       |
|    learning_rate        | 0.00018      |
|    loss                 | 0.06         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0455      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 126.96474358974359
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -352       |
| time/                   |            |
|    fps                  | 60         |
|    iterations           | 26         |
|    time_elapsed         | 439        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00840632 |
|    clip_fraction        | 0.148      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.49      |
|    explained_variance   | 0.323      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.97       |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0421    |
|    value_loss           | 2.7        |
---------

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 144.58333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -335        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 34          |
|    time_elapsed         | 575         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008819554 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.12       |
|    explained_variance   | 0.419       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.775       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0472     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 159.43849206349208
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -310        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 42          |
|    time_elapsed         | 713         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007378944 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.5        |
|    explained_variance   | 0.486       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0412     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 173.61166666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -281        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 50          |
|    time_elapsed         | 853         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007877469 |
|    clip_fraction        | 0.148       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.03       |
|    explained_variance   | 0.539       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.1         |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0431     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 170.0
Standard deviation of reward: 0.0
Average successful assignments: 185.35201149425288
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -235         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 58           |
|    time_elapsed         | 990          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0073135076 |
|    clip_fraction        | 0.137        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.64        |
|    explained_variance   | 0.626        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.37         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0411      |
|    value_

-------- Rollout Summary --------
Total mean reward: 170.0
Standard deviation of reward: 0.0
Average successful assignments: 195.2171717171717
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 66          |
|    time_elapsed         | 1126        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007629253 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.41       |
|    explained_variance   | 0.605       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0352     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 180.0
Standard deviation of reward: 0.0
Average successful assignments: 203.4144144144144
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -137        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 74          |
|    time_elapsed         | 1270        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006680745 |
|    clip_fraction        | 0.109       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.27       |
|    explained_variance   | 0.443       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0356     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 182.0
Standard deviation of reward: 0.0
Average successful assignments: 210.3109756097561
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -94          |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 82           |
|    time_elapsed         | 1414         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0055541797 |
|    clip_fraction        | 0.101        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.04        |
|    explained_variance   | 0.402        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.804        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0351      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 184.0
Standard deviation of reward: 0.0
Average successful assignments: 216.16203703703704
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -57.9        |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 90           |
|    time_elapsed         | 1564         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0058142683 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.89        |
|    explained_variance   | 0.501        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.37         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0345      |
|    value_

-------- Rollout Summary --------
Total mean reward: 188.0
Standard deviation of reward: 0.0
Average successful assignments: 221.19387755102042
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -33.3        |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 98           |
|    time_elapsed         | 1714         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0054976894 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.83        |
|    explained_variance   | 0.547        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.42         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0367      |
|    value_

In [15]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -338.0
Standard deviation of reward: 0.0
Average successful assignments: 27.333333333333332
All assignments history: [10, 8, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -382     |
| time/              |          |
|    fps             | 59       |
|    iterations      | 1        |
|    time_elapsed    | 17       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -280.0
Standard deviation of reward: 0.0
Average successful assignments: 39.833333333333336
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -377        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 2           |
|    time_elapsed         | 35          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008177381 |
|    clip_fraction        | 0.094       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.00323    |
|    learning_rate        | 0.00018     |
|    loss                 | 3.69        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0385     |
|    value_loss           | 2

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 137.48333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 10          |
|    time_elapsed         | 184         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010123108 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.76       |
|    explained_variance   | 0.0167      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.87        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0387     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 152.96296296296296
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -355        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 334         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010783102 |
|    clip_fraction        | 0.28        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.57       |
|    explained_variance   | 0.176       |
|    learning_rate        | 0.00018     |
|    loss                 | 2           |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.047      |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 159.17307692307693
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -339      |
| time/                   |           |
|    fps                  | 55        |
|    iterations           | 26        |
|    time_elapsed         | 483       |
|    total_timesteps      | 26624     |
| train/                  |           |
|    approx_kl            | 0.0081628 |
|    clip_fraction        | 0.136     |
|    clip_range           | 0.15      |
|    entropy_loss         | -5.21     |
|    explained_variance   | 0.334     |
|    learning_rate        | 0.00018   |
|    loss                 | 0.514     |
|    n_updates            | 250       |
|    policy_gradient_loss | -0.0373   |
|    value_loss           | 2.85      |
-----------------------------

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 166.35049019607843
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -316        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 34          |
|    time_elapsed         | 623         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009013325 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.61       |
|    explained_variance   | 0.475       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.9         |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0449     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 174.5952380952381
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -289        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 42          |
|    time_elapsed         | 771         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008877454 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.11       |
|    explained_variance   | 0.487       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.69        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0442     |
|    value_loss           | 3.14

-------- Rollout Summary --------
Total mean reward: 86.0
Standard deviation of reward: 0.0
Average successful assignments: 182.19333333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -263         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 50           |
|    time_elapsed         | 917          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0076437965 |
|    clip_fraction        | 0.174        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.82        |
|    explained_variance   | 0.459        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.045       |
|    value_l

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 189.53879310344828
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -220         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 58           |
|    time_elapsed         | 1067         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0077028014 |
|    clip_fraction        | 0.147        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.7         |
|    explained_variance   | 0.386        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.4          |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0405      |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 196.44065656565655
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 66          |
|    time_elapsed         | 1215        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006065801 |
|    clip_fraction        | 0.12        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.49       |
|    explained_variance   | 0.368       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0367     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 202.56306306306305
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -135         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 74           |
|    time_elapsed         | 1362         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0068308013 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.36        |
|    explained_variance   | 0.411        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.993        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0368      |
|    value_

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 207.864837398374
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -99.3        |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 82           |
|    time_elapsed         | 1501         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0055898456 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.27        |
|    explained_variance   | 0.401        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.39         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.032       |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 212.33703703703705
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -73.2        |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 90           |
|    time_elapsed         | 1643         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0076954104 |
|    clip_fraction        | 0.148        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.21        |
|    explained_variance   | 0.424        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.21         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0379      |
|    value_

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 216.42091836734693
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -56.2      |
| time/                   |            |
|    fps                  | 56         |
|    iterations           | 98         |
|    time_elapsed         | 1785       |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.00714347 |
|    clip_fraction        | 0.14       |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.12      |
|    explained_variance   | 0.361      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.32       |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0381    |
|    value_loss           | 3.23       |
--------

In [16]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -356.0
Standard deviation of reward: 0.0
Average successful assignments: 21.333333333333332
All assignments history: [19, 17, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -364     |
| time/              |          |
|    fps             | 61       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -318.0
Standard deviation of reward: 0.0
Average successful assignments: 28.833333333333332
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -369         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 2            |
|    time_elapsed         | 35           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0076047173 |
|    clip_fraction        | 0.0716       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | -0.115       |
|    learning_rate        | 0.00018      |
|    loss                 | 3.11         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0421      |
|    value

-------- Rollout Summary --------
Total mean reward: -46.0
Standard deviation of reward: 0.0
Average successful assignments: 102.71666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 10          |
|    time_elapsed         | 180         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009740794 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | -0.00405    |
|    learning_rate        | 0.00018     |
|    loss                 | 1.95        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0421     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 133.44444444444446
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 18          |
|    time_elapsed         | 326         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010630023 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.154       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.295       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0465     |
|    value_loss           | 2.75

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 149.08333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -349        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 26          |
|    time_elapsed         | 471         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.006374332 |
|    clip_fraction        | 0.0921      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.52       |
|    explained_variance   | 0.269       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.27        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0378     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 163.5857843137255
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -330        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 34          |
|    time_elapsed         | 613         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008560861 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.99       |
|    explained_variance   | 0.43        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0498     |
|    value_loss           | 2.6 

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 176.0793650793651
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -302         |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 42           |
|    time_elapsed         | 748          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0071756495 |
|    clip_fraction        | 0.134        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.23        |
|    explained_variance   | 0.383        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.61         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0436      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 187.04166666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -271        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 50          |
|    time_elapsed         | 876         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006929202 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.84       |
|    explained_variance   | 0.491       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.59        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0352     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 196.19540229885058
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -225         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 58           |
|    time_elapsed         | 1004         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0072218273 |
|    clip_fraction        | 0.137        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.55        |
|    explained_variance   | 0.425        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.53         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0419      |
|    value_

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 203.78535353535352
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -176         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 66           |
|    time_elapsed         | 1133         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0059719686 |
|    clip_fraction        | 0.0991       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.41        |
|    explained_variance   | 0.496        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.55         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0319      |
|    value_

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 210.1204954954955
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -129         |
| time/                   |              |
|    fps                  | 60           |
|    iterations           | 74           |
|    time_elapsed         | 1260         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0060371766 |
|    clip_fraction        | 0.126        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.43        |
|    explained_variance   | 0.451        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.99         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.04        |
|    value_l

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 215.390243902439
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -87.8      |
| time/                   |            |
|    fps                  | 61         |
|    iterations           | 82         |
|    time_elapsed         | 1370       |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00669647 |
|    clip_fraction        | 0.108      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.28      |
|    explained_variance   | 0.497      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.2        |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0348    |
|    value_loss           | 3.76       |
----------

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 219.88518518518518
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -57.6        |
| time/                   |              |
|    fps                  | 62           |
|    iterations           | 90           |
|    time_elapsed         | 1481         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0060589663 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.18        |
|    explained_variance   | 0.451        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.98         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0384      |
|    value_

-------- Rollout Summary --------
Total mean reward: 170.0
Standard deviation of reward: 0.0
Average successful assignments: 223.84013605442178
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -39.1       |
| time/                   |             |
|    fps                  | 63          |
|    iterations           | 98          |
|    time_elapsed         | 1592        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006922554 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.18       |
|    explained_variance   | 0.479       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.63        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0365     |
|    value_loss           | 4.

In [17]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -364.0
Standard deviation of reward: 0.0
Average successful assignments: 16.916666666666668
All assignments history: [13, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -377     |
| time/              |          |
|    fps             | 79       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -342.0
Standard deviation of reward: 0.0
Average successful assignments: 22.083333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.006894336 |
|    clip_fraction        | 0.0584      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.0914     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.8         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0383     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 122.86666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 10          |
|    time_elapsed         | 136         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010207795 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.00487     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.894       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0422     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 154.88888888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -356        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 18          |
|    time_elapsed         | 246         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009462609 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.177       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.19        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0426     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 90.0
Standard deviation of reward: 0.0
Average successful assignments: 171.64423076923077
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -344        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 26          |
|    time_elapsed         | 358         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008374213 |
|    clip_fraction        | 0.138       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.41       |
|    explained_variance   | 0.296       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.724       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0399     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 183.34803921568627
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -322       |
| time/                   |            |
|    fps                  | 74         |
|    iterations           | 34         |
|    time_elapsed         | 467        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.00825927 |
|    clip_fraction        | 0.138      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.83      |
|    explained_variance   | 0.323      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.84       |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0426    |
|    value_loss           | 2.8        |
--------

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 194.10714285714286
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -294        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 42          |
|    time_elapsed         | 598         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009292789 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.13       |
|    explained_variance   | 0.404       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0472     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 203.925
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -261        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 50          |
|    time_elapsed         | 703         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007279791 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.5        |
|    explained_variance   | 0.314       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.48        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0406     |
|    value_loss           | 3.33        |

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 211.85632183908046
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -209        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 58          |
|    time_elapsed         | 820         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.005613707 |
|    clip_fraction        | 0.0924      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.14       |
|    explained_variance   | 0.405       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.55        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0334     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 218.22979797979798
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -156         |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 66           |
|    time_elapsed         | 929          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0060822126 |
|    clip_fraction        | 0.0979       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.01        |
|    explained_variance   | 0.479        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.41         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0326      |
|    value_

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 223.356981981982
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -104       |
| time/                   |            |
|    fps                  | 73         |
|    iterations           | 74         |
|    time_elapsed         | 1030       |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.00456605 |
|    clip_fraction        | 0.0813     |
|    clip_range           | 0.15       |
|    entropy_loss         | -2.85      |
|    explained_variance   | 0.504      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.44       |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.028     |
|    value_loss           | 3.41       |
----------

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 227.6941056910569
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -58.6       |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 82          |
|    time_elapsed         | 1131        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.005002732 |
|    clip_fraction        | 0.0909      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.7        |
|    explained_variance   | 0.523       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.29        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0291     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 231.31296296296296
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -24.9        |
| time/                   |              |
|    fps                  | 74           |
|    iterations           | 90           |
|    time_elapsed         | 1233         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0054050917 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.73        |
|    explained_variance   | 0.602        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.01         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0342      |
|    value_

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 234.44557823129253
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -4.12       |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 98          |
|    time_elapsed         | 1338        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005588569 |
|    clip_fraction        | 0.0994      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.79       |
|    explained_variance   | 0.643       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.53        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0259     |
|    value_loss           | 3.

In [18]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -388.0
Standard deviation of reward: 0.0
Average successful assignments: 7.166666666666667
All assignments history: [17, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -374     |
| time/              |          |
|    fps             | 81       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -110.0
Standard deviation of reward: 0.0
Average successful assignments: 65.375
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 77          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007828108 |
|    clip_fraction        | 0.0806      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.323      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.37        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0395     |
|    value_loss           | 18.4        |

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 144.35
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 10          |
|    time_elapsed         | 127         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010399552 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.00524     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.452       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0417     |
|    value_loss           | 3.9         |
-

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 158.62962962962962
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 18          |
|    time_elapsed         | 228         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011009565 |
|    clip_fraction        | 0.237       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.134       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.277       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0458     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 167.75320512820514
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -347        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 26          |
|    time_elapsed         | 329         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007160797 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.42       |
|    explained_variance   | 0.38        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.63        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0381     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 175.76960784313727
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -326        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 34          |
|    time_elapsed         | 428         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008382887 |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.86       |
|    explained_variance   | 0.505       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.802       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0423     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 108.0
Standard deviation of reward: 0.0
Average successful assignments: 185.140873015873
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -298        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 42          |
|    time_elapsed         | 516         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008085389 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.19       |
|    explained_variance   | 0.604       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.08        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0412     |
|    value_loss           | 3.13

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 194.86166666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -268        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 594         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006460952 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.78       |
|    explained_variance   | 0.668       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.892       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0438     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 202.81896551724137
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -220       |
| time/                   |            |
|    fps                  | 91         |
|    iterations           | 58         |
|    time_elapsed         | 648        |
|    total_timesteps      | 59392      |
| train/                  |            |
|    approx_kl            | 0.00705129 |
|    clip_fraction        | 0.12       |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.38      |
|    explained_variance   | 0.65       |
|    learning_rate        | 0.00018    |
|    loss                 | 1.31       |
|    n_updates            | 570        |
|    policy_gradient_loss | -0.038     |
|    value_loss           | 3.36       |
--------

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 209.87247474747474
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -168         |
| time/                   |              |
|    fps                  | 97           |
|    iterations           | 66           |
|    time_elapsed         | 695          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0069625303 |
|    clip_fraction        | 0.114        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.11        |
|    explained_variance   | 0.489        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.46         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0352      |
|    value_

-------- Rollout Summary --------
Total mean reward: 152.0
Standard deviation of reward: 0.0
Average successful assignments: 215.43355855855856
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -118        |
| time/                   |             |
|    fps                  | 103         |
|    iterations           | 74          |
|    time_elapsed         | 732         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.005970304 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.03       |
|    explained_variance   | 0.471       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.031      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 152.0
Standard deviation of reward: 0.0
Average successful assignments: 220.0721544715447
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -75.8       |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 82          |
|    time_elapsed         | 766         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006337933 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.97       |
|    explained_variance   | 0.381       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.07        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0366     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 224.13703703703703
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -43.8       |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 90          |
|    time_elapsed         | 801         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.006464553 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.83       |
|    explained_variance   | 0.459       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0332     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 227.60799319727892
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -22.6       |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 98          |
|    time_elapsed         | 835         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005478408 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.89       |
|    explained_variance   | 0.446       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0367     |
|    value_loss           | 3.

In [19]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -376.0
Standard deviation of reward: 0.0
Average successful assignments: 12.25
All assignments history: [12, 15, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -373     |
| time/              |          |
|    fps             | 266      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -358.0
Standard deviation of reward: 0.0
Average successful assignments: 16.5
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -367         |
| time/                   |              |
|    fps                  | 256          |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0062238574 |
|    clip_fraction        | 0.0437       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.84        |
|    explained_variance   | -0.124       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.95         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0366      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 108.85
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 244         |
|    iterations           | 10          |
|    time_elapsed         | 41          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009469774 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | -8.94e-05   |
|    learning_rate        | 0.00018     |
|    loss                 | 0.954       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0389     |
|    value_loss           | 4.18        |
-

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 144.97685185185185
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 18          |
|    time_elapsed         | 77          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009681193 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.0958      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.14        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0411     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 162.1602564102564
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -346        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 26          |
|    time_elapsed         | 111         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008579057 |
|    clip_fraction        | 0.15        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.45       |
|    explained_variance   | 0.287       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.8         |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.043      |
|    value_loss           | 2.81

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 172.7941176470588
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -326        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 34          |
|    time_elapsed         | 145         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008298276 |
|    clip_fraction        | 0.159       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.91       |
|    explained_variance   | 0.293       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.879       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0462     |
|    value_loss           | 2.97

-------- Rollout Summary --------
Total mean reward: 96.0
Standard deviation of reward: 0.0
Average successful assignments: 181.30753968253967
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -301        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 42          |
|    time_elapsed         | 180         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009174753 |
|    clip_fraction        | 0.143       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.32       |
|    explained_variance   | 0.426       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0414     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 190.52333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -272         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 50           |
|    time_elapsed         | 214          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0074256468 |
|    clip_fraction        | 0.114        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.78        |
|    explained_variance   | 0.525        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.06         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.036       |
|    value_

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 198.85488505747125
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -224         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 58           |
|    time_elapsed         | 249          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0053247195 |
|    clip_fraction        | 0.099        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.38        |
|    explained_variance   | 0.432        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.832        |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0374      |
|    value_

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 205.85858585858585
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 66          |
|    time_elapsed         | 283         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.005710993 |
|    clip_fraction        | 0.118       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.02       |
|    explained_variance   | 0.352       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.77        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0368     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 212.02702702702703
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -123         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 74           |
|    time_elapsed         | 317          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0060529113 |
|    clip_fraction        | 0.1          |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.98        |
|    explained_variance   | 0.334        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.25         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0332      |
|    value_

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 217.02337398373984
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -80.1       |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 82          |
|    time_elapsed         | 352         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006105784 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.97       |
|    explained_variance   | 0.295       |
|    learning_rate        | 0.00018     |
|    loss                 | 1           |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0356     |
|    value_loss           | 3 

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 221.35462962962964
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -45.5      |
| time/                   |            |
|    fps                  | 238        |
|    iterations           | 90         |
|    time_elapsed         | 386        |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00746561 |
|    clip_fraction        | 0.128      |
|    clip_range           | 0.15       |
|    entropy_loss         | -2.89      |
|    explained_variance   | 0.319      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.3        |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0352    |
|    value_loss           | 3.41       |
--------

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 225.16496598639455
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -23.9       |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 98          |
|    time_elapsed         | 421         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006600689 |
|    clip_fraction        | 0.131       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.86       |
|    explained_variance   | 0.371       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0368     |
|    value_loss           | 3.

In [20]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 10.166666666666666
All assignments history: [13, 19, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -368     |
| time/              |          |
|    fps             | 258      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -352.0
Standard deviation of reward: 0.0
Average successful assignments: 16.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -373        |
| time/                   |             |
|    fps                  | 249         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007978009 |
|    clip_fraction        | 0.0862      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.135      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.63        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0411     |
|    value_loss           | 16.8        |
-

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 108.40833333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 241         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010147281 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.0188      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.65        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0433     |
|    value_loss           | 4.0

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 150.56944444444446
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -356        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 18          |
|    time_elapsed         | 76          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009250157 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.67       |
|    explained_variance   | 0.147       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.051       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0437     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 166.76923076923077
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -342         |
| time/                   |              |
|    fps                  | 239          |
|    iterations           | 26           |
|    time_elapsed         | 111          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0074556386 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.33        |
|    explained_variance   | 0.309        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.91         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0399      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 108.0
Standard deviation of reward: 0.0
Average successful assignments: 178.98529411764707
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -317         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 34           |
|    time_elapsed         | 145          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0076635513 |
|    clip_fraction        | 0.142        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.69        |
|    explained_variance   | 0.436        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.986        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.045       |
|    value_

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 189.04166666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -287         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 42           |
|    time_elapsed         | 180          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0070580663 |
|    clip_fraction        | 0.148        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.97        |
|    explained_variance   | 0.531        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.14         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0439      |
|    value_

-------- Rollout Summary --------
Total mean reward: 148.0
Standard deviation of reward: 0.0
Average successful assignments: 198.36833333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -254        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 50          |
|    time_elapsed         | 215         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007279274 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.51       |
|    explained_variance   | 0.529       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.34        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.038      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 170.0
Standard deviation of reward: 0.0
Average successful assignments: 207.32758620689654
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -203        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 58          |
|    time_elapsed         | 249         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006396707 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.21       |
|    explained_variance   | 0.549       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0388     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 170.0
Standard deviation of reward: 0.0
Average successful assignments: 214.9179292929293
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -148         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 66           |
|    time_elapsed         | 285          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0064802347 |
|    clip_fraction        | 0.109        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.03        |
|    explained_variance   | 0.415        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.54         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0326      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 172.0
Standard deviation of reward: 0.0
Average successful assignments: 220.98536036036037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -97.1       |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 74          |
|    time_elapsed         | 321         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006421431 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.93       |
|    explained_variance   | 0.332       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.48        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0351     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 178.0
Standard deviation of reward: 0.0
Average successful assignments: 226.22560975609755
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -53.2       |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 82          |
|    time_elapsed         | 356         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006473557 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.87       |
|    explained_variance   | 0.305       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.57        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0339     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 184.0
Standard deviation of reward: 0.0
Average successful assignments: 230.7490740740741
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -22         |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 90          |
|    time_elapsed         | 392         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.006777312 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.87       |
|    explained_variance   | 0.379       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.57        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0354     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 186.0
Standard deviation of reward: 0.0
Average successful assignments: 234.68112244897958
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -2           |
| time/                   |              |
|    fps                  | 234          |
|    iterations           | 98           |
|    time_elapsed         | 427          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0068788384 |
|    clip_fraction        | 0.142        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.83        |
|    explained_variance   | 0.498        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.18         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0391      |
|    value_

In [21]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 9.75
All assignments history: [10, 17, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -373     |
| time/              |          |
|    fps             | 256      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -278.0
Standard deviation of reward: 0.0
Average successful assignments: 31.458333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007454997 |
|    clip_fraction        | 0.0792      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | -0.202      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.92        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0386     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -142.0
Standard deviation of reward: 0.0
Average successful assignments: 105.375
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 10          |
|    time_elapsed         | 43          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011317259 |
|    clip_fraction        | 0.239       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.00743     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.746       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0447     |
|    value_loss           | 3.85        

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 134.00462962962962
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 18          |
|    time_elapsed         | 78          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009719079 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.119       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.49        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0409     |
|    value_loss           | 3.03

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 147.96794871794873
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -345        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 26          |
|    time_elapsed         | 114         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007949028 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.38       |
|    explained_variance   | 0.365       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.628       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0385     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 159.76225490196077
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -320        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 34          |
|    time_elapsed         | 149         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008296595 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.79       |
|    explained_variance   | 0.608       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.719       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.042      |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 104.0
Standard deviation of reward: 0.0
Average successful assignments: 171.5079365079365
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -291         |
| time/                   |              |
|    fps                  | 232          |
|    iterations           | 42           |
|    time_elapsed         | 184          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0068046832 |
|    clip_fraction        | 0.118        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.18        |
|    explained_variance   | 0.645        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.29         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0395      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 181.90166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -264        |
| time/                   |             |
|    fps                  | 232         |
|    iterations           | 50          |
|    time_elapsed         | 220         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008614544 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.77       |
|    explained_variance   | 0.659       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.961       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0423     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 191.26005747126436
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -215         |
| time/                   |              |
|    fps                  | 232          |
|    iterations           | 58           |
|    time_elapsed         | 255          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0071016653 |
|    clip_fraction        | 0.145        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.45        |
|    explained_variance   | 0.477        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.04         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0411      |
|    value_

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 199.25757575757575
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -166        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 66          |
|    time_elapsed         | 290         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007295126 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.27       |
|    explained_variance   | 0.376       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.707       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0391     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 206.06193693693695
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -118        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 74          |
|    time_elapsed         | 322         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008009113 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.12       |
|    explained_variance   | 0.357       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.977       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0387     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 212.1138211382114
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -76.5        |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 82           |
|    time_elapsed         | 354          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0064136665 |
|    clip_fraction        | 0.129        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.95        |
|    explained_variance   | 0.417        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.14         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0373      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 217.28703703703704
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -44.5        |
| time/                   |              |
|    fps                  | 239          |
|    iterations           | 90           |
|    time_elapsed         | 384          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0078089545 |
|    clip_fraction        | 0.153        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.82        |
|    explained_variance   | 0.479        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.26         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0355      |
|    value_

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 221.61904761904762
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -20         |
| time/                   |             |
|    fps                  | 242         |
|    iterations           | 98          |
|    time_elapsed         | 413         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006241399 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.83       |
|    explained_variance   | 0.363       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0327     |
|    value_loss           | 3.