In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def train_svr_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    svr_model = SVR()
    svr_model.fit(X_train_scaled, y_train)
    return svr_model, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.1.csv')
svr_model, scaler = train_svr_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.1.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = svr_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']  # This would be prior to overwriting with predictions if you run this block again

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


MAE: 1.403494699070916
RMSE: 1.9135086312664382
R-squared: 0.9752350615552381
RAE: 0.14658131997050208


In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -356.0
Standard deviation of reward: 0.0
Average successful assignments: 21.25
All assignments history: [18, 17, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -365     |
| time/              |          |
|    fps             | 61       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -370.0
Standard deviation of reward: 0.0
Average successful assignments: 17.916666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 2           |
|    time_elapsed         | 35          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008046079 |
|    clip_fraction        | 0.0928      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.85       |
|    explained_variance   | -0.253      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.75        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0406     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 93.5
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 10          |
|    time_elapsed         | 180         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009843765 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | 0.0205      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.34        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0389     |
|    value_loss           | 4.01        |
--

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 128.59259259259258
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 330         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010510648 |
|    clip_fraction        | 0.218       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.21        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.919       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0448     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 147.09615384615384
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -352         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 26           |
|    time_elapsed         | 481          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0073748855 |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.5         |
|    explained_variance   | 0.327        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.912        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0407      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 157.47058823529412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -336        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 34          |
|    time_elapsed         | 632         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008403387 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.02       |
|    explained_variance   | 0.315       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.12        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0438     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 167.23214285714286
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -312         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 42           |
|    time_elapsed         | 785          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0069080517 |
|    clip_fraction        | 0.137        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.27        |
|    explained_variance   | 0.335        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.904        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0434      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 92.0
Standard deviation of reward: 0.0
Average successful assignments: 176.26166666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -285         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 50           |
|    time_elapsed         | 936          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0073735174 |
|    clip_fraction        | 0.152        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.96        |
|    explained_variance   | 0.327        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.16         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0459      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 108.0
Standard deviation of reward: 0.0
Average successful assignments: 184.10632183908046
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -245        |
| time/                   |             |
|    fps                  | 53          |
|    iterations           | 58          |
|    time_elapsed         | 1102        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006306142 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.78       |
|    explained_variance   | 0.394       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.33        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.036      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 190.55429292929293
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -203         |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 66           |
|    time_elapsed         | 1269         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0060788086 |
|    clip_fraction        | 0.108        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.44        |
|    explained_variance   | 0.498        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.15         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.037       |
|    value_

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 196.36148648648648
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -157         |
| time/                   |              |
|    fps                  | 52           |
|    iterations           | 74           |
|    time_elapsed         | 1434         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0071136374 |
|    clip_fraction        | 0.125        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.21        |
|    explained_variance   | 0.442        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.9          |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0326      |
|    value_

-------- Rollout Summary --------
Total mean reward: 126.0
Standard deviation of reward: 0.0
Average successful assignments: 201.46239837398375
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -117       |
| time/                   |            |
|    fps                  | 52         |
|    iterations           | 82         |
|    time_elapsed         | 1604       |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00620036 |
|    clip_fraction        | 0.0903     |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.22      |
|    explained_variance   | 0.452      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.28       |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.032     |
|    value_loss           | 4.14       |
--------

-------- Rollout Summary --------
Total mean reward: 130.0
Standard deviation of reward: 0.0
Average successful assignments: 205.73055555555555
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -87.4      |
| time/                   |            |
|    fps                  | 52         |
|    iterations           | 90         |
|    time_elapsed         | 1771       |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00679866 |
|    clip_fraction        | 0.106      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.22      |
|    explained_variance   | 0.434      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.58       |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0304    |
|    value_loss           | 4.03       |
--------

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 209.44642857142858
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -67.5       |
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 98          |
|    time_elapsed         | 1927        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.007196997 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.18       |
|    explained_variance   | 0.501       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.75        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0361     |
|    value_loss           | 3.

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -374.0
Standard deviation of reward: 0.0
Average successful assignments: 13.416666666666666
All assignments history: [15, 16, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -369     |
| time/              |          |
|    fps             | 56       |
|    iterations      | 1        |
|    time_elapsed    | 18       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -152.0
Standard deviation of reward: 0.0
Average successful assignments: 59.708333333333336
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -368         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 2            |
|    time_elapsed         | 36           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0081891315 |
|    clip_fraction        | 0.0897       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.85        |
|    explained_variance   | -0.0873      |
|    learning_rate        | 0.00018      |
|    loss                 | 2.13         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0424      |
|    value

-------- Rollout Summary --------
Total mean reward: -92.0
Standard deviation of reward: 0.0
Average successful assignments: 114.91666666666667
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -366       |
| time/                   |            |
|    fps                  | 54         |
|    iterations           | 10         |
|    time_elapsed         | 187        |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01008973 |
|    clip_fraction        | 0.2        |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.78      |
|    explained_variance   | 0.00719    |
|    learning_rate        | 0.00018    |
|    loss                 | 1.23       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0395    |
|    value_loss           | 4.14       |
--------

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 141.99074074074073
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 18          |
|    time_elapsed         | 340         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010669449 |
|    clip_fraction        | 0.228       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.164       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.601       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0481     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 158.29166666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -346        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 26          |
|    time_elapsed         | 487         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008955287 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.46       |
|    explained_variance   | 0.294       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.65        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0465     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 168.92156862745097
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -329        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 34          |
|    time_elapsed         | 633         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008465742 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.03       |
|    explained_variance   | 0.351       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.947       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0479     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: 104.0
Standard deviation of reward: 0.0
Average successful assignments: 178.7718253968254
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -304        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 42          |
|    time_elapsed         | 783         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008951504 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.49       |
|    explained_variance   | 0.454       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.1         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0447     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 106.0
Standard deviation of reward: 0.0
Average successful assignments: 187.395
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -277        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 50          |
|    time_elapsed         | 929         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006867169 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.95       |
|    explained_variance   | 0.406       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0389     |
|    value_loss           | 3.59        |

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 194.67097701149424
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -234         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 58           |
|    time_elapsed         | 1068         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0067179464 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.66        |
|    explained_variance   | 0.364        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.25         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0365      |
|    value_

-------- Rollout Summary --------
Total mean reward: 126.0
Standard deviation of reward: 0.0
Average successful assignments: 201.09722222222223
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -189        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 66          |
|    time_elapsed         | 1207        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.005517765 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.43       |
|    explained_variance   | 0.415       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0357     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 206.18693693693695
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -145         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 74           |
|    time_elapsed         | 1344         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0058221244 |
|    clip_fraction        | 0.0987       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.24        |
|    explained_variance   | 0.418        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.8          |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0353      |
|    value_

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 210.7428861788618
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -105         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 82           |
|    time_elapsed         | 1481         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0060691936 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.13        |
|    explained_variance   | 0.568        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.2          |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0356      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 214.9638888888889
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -74.3      |
| time/                   |            |
|    fps                  | 56         |
|    iterations           | 90         |
|    time_elapsed         | 1620       |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00670682 |
|    clip_fraction        | 0.13       |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.27      |
|    explained_variance   | 0.6        |
|    learning_rate        | 0.00018    |
|    loss                 | 1.55       |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0399    |
|    value_loss           | 3.48       |
---------

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 218.70238095238096
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -55.9        |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 98           |
|    time_elapsed         | 1762         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0067667686 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.21        |
|    explained_variance   | 0.648        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.31         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.039       |
|    value_

In [15]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -372.0
Standard deviation of reward: 0.0
Average successful assignments: 13.583333333333334
All assignments history: [13, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -377     |
| time/              |          |
|    fps             | 62       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -300.0
Standard deviation of reward: 0.0
Average successful assignments: 29.0
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -372         |
| time/                   |              |
|    fps                  | 66           |
|    iterations           | 2            |
|    time_elapsed         | 30           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0075588953 |
|    clip_fraction        | 0.0707       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.85        |
|    explained_variance   | -0.0784      |
|    learning_rate        | 0.00018      |
|    loss                 | 3.28         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0426      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: -70.0
Standard deviation of reward: 0.0
Average successful assignments: 95.99166666666666
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -366       |
| time/                   |            |
|    fps                  | 60         |
|    iterations           | 10         |
|    time_elapsed         | 168        |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.00952352 |
|    clip_fraction        | 0.164      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.79      |
|    explained_variance   | 0.00654    |
|    learning_rate        | 0.00018    |
|    loss                 | 1.69       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0388    |
|    value_loss           | 4.33       |
---------

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 133.15277777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 18          |
|    time_elapsed         | 301         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.007826412 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.131       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.84        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0384     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 152.03846153846155
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -345        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 26          |
|    time_elapsed         | 440         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008548416 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.38       |
|    explained_variance   | 0.295       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.33        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0444     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 163.68627450980392
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -323        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 34          |
|    time_elapsed         | 578         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007835533 |
|    clip_fraction        | 0.131       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.73       |
|    explained_variance   | 0.535       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0432     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 173.3234126984127
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -295        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 42          |
|    time_elapsed         | 717         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.006977313 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.19       |
|    explained_variance   | 0.599       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0413     |
|    value_loss           | 3.07

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 182.28833333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -270         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 50           |
|    time_elapsed         | 856          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0064327726 |
|    clip_fraction        | 0.107        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.91        |
|    explained_variance   | 0.518        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.56         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0357      |
|    value_

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 190.33333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -228         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 58           |
|    time_elapsed         | 992          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0070815985 |
|    clip_fraction        | 0.119        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.67        |
|    explained_variance   | 0.388        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.77         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0406      |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 197.41919191919192
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 66          |
|    time_elapsed         | 1125        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006218148 |
|    clip_fraction        | 0.0968      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.43       |
|    explained_variance   | 0.465       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.19        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0317     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 202.95945945945945
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -141         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 74           |
|    time_elapsed         | 1270         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0063604293 |
|    clip_fraction        | 0.108        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.25        |
|    explained_variance   | 0.541        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.23         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0346      |
|    value_

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 208.0721544715447
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -103        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 82          |
|    time_elapsed         | 1415        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.005923017 |
|    clip_fraction        | 0.0851      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.04       |
|    explained_variance   | 0.604       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.55        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0308     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 212.5185185185185
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -74.6        |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 90           |
|    time_elapsed         | 1563         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0059606032 |
|    clip_fraction        | 0.084        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.01        |
|    explained_variance   | 0.607        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.96         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0269      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 216.1437074829932
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -53.2      |
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 98         |
|    time_elapsed         | 1712       |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.00684129 |
|    clip_fraction        | 0.101      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.02      |
|    explained_variance   | 0.654      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.23       |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0272    |
|    value_loss           | 3.16       |
---------

In [16]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -392.0
Standard deviation of reward: 0.0
Average successful assignments: 5.666666666666667
All assignments history: [16, 12, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 59       |
|    iterations      | 1        |
|    time_elapsed    | 17       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -322.0
Standard deviation of reward: 0.0
Average successful assignments: 20.25
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -372         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 2            |
|    time_elapsed         | 35           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0070834435 |
|    clip_fraction        | 0.0674       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.85        |
|    explained_variance   | -0.186       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.88         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0376      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 88.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -369        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 10          |
|    time_elapsed         | 182         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009780252 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | 0.0111      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.494       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0429     |
|    value_loss           | 3.91        |
---

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 129.18055555555554
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 331         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011024645 |
|    clip_fraction        | 0.242       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.176       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0393      |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0504     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 145.12179487179486
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -350        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 26          |
|    time_elapsed         | 480         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008932674 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.49       |
|    explained_variance   | 0.387       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.592       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0457     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 155.34558823529412
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -333        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 34          |
|    time_elapsed         | 620         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008946135 |
|    clip_fraction        | 0.162       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.05       |
|    explained_variance   | 0.382       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.25        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0453     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 164.13492063492063
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -309        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 42          |
|    time_elapsed         | 769         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008980476 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.35       |
|    explained_variance   | 0.534       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.45        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0494     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 172.04833333333335
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -283        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 50          |
|    time_elapsed         | 915         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007665363 |
|    clip_fraction        | 0.143       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.92       |
|    explained_variance   | 0.588       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.57        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0436     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 179.39080459770116
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -242        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 58          |
|    time_elapsed         | 1066        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007832247 |
|    clip_fraction        | 0.136       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.83       |
|    explained_variance   | 0.55        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0433     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 98.0
Standard deviation of reward: 0.0
Average successful assignments: 185.62373737373738
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -198         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 66           |
|    time_elapsed         | 1214         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0068490515 |
|    clip_fraction        | 0.138        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.63        |
|    explained_variance   | 0.442        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.1          |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0435      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 116.0
Standard deviation of reward: 0.0
Average successful assignments: 191.35923423423424
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -157         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 74           |
|    time_elapsed         | 1364         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0073952107 |
|    clip_fraction        | 0.129        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.46        |
|    explained_variance   | 0.397        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.991        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0414      |
|    value_

-------- Rollout Summary --------
Total mean reward: 130.0
Standard deviation of reward: 0.0
Average successful assignments: 196.83231707317074
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -120         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 82           |
|    time_elapsed         | 1504         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0073426743 |
|    clip_fraction        | 0.122        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.41        |
|    explained_variance   | 0.338        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.28         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0401      |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 201.6861111111111
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -89.5        |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 90           |
|    time_elapsed         | 1646         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0066611883 |
|    clip_fraction        | 0.136        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.25        |
|    explained_variance   | 0.36         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.2          |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0401      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 206.0654761904762
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -67.6        |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 98           |
|    time_elapsed         | 1793         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0076984046 |
|    clip_fraction        | 0.128        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.2         |
|    explained_variance   | 0.429        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.4          |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0365      |
|    value_l

done till here

In [17]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -348.0
Standard deviation of reward: 0.0
Average successful assignments: 23.5
All assignments history: [9, 13, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -378     |
| time/              |          |
|    fps             | 58       |
|    iterations      | 1        |
|    time_elapsed    | 17       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -394.0
Standard deviation of reward: 0.0
Average successful assignments: 14.0
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -377         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 2            |
|    time_elapsed         | 35           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0074215503 |
|    clip_fraction        | 0.0686       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.85        |
|    explained_variance   | -0.347       |
|    learning_rate        | 0.00018      |
|    loss                 | 3.08         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0383      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 106.55
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 10          |
|    time_elapsed         | 180         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009874817 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | 0.00482     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.784       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0397     |
|    value_loss           | 4.29        |
-

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 134.19907407407408
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 18          |
|    time_elapsed         | 326         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010602453 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.15        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.538       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0438     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 150.31089743589743
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -346        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 26          |
|    time_elapsed         | 472         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007966049 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.32       |
|    explained_variance   | 0.401       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.594       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0388     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 160.4436274509804
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -325         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 34           |
|    time_elapsed         | 612          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0071863667 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.77        |
|    explained_variance   | 0.656        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.615        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0393      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 167.68253968253967
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -301         |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 42           |
|    time_elapsed         | 747          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0076091024 |
|    clip_fraction        | 0.101        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.36        |
|    explained_variance   | 0.753        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.971        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0365      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 173.46833333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -277         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 50           |
|    time_elapsed         | 876          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0072930986 |
|    clip_fraction        | 0.152        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.99        |
|    explained_variance   | 0.696        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.74         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0439      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 178.55603448275863
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -238         |
| time/                   |              |
|    fps                  | 59           |
|    iterations           | 58           |
|    time_elapsed         | 1004         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0079888785 |
|    clip_fraction        | 0.135        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.76        |
|    explained_variance   | 0.59         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.08         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0404      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 183.08712121212122
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -199        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 66          |
|    time_elapsed         | 1133        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007677613 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.61       |
|    explained_variance   | 0.567       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.96        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.04       |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 187.1677927927928
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -163        |
| time/                   |             |
|    fps                  | 60          |
|    iterations           | 74          |
|    time_elapsed         | 1258        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006762786 |
|    clip_fraction        | 0.138       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.58       |
|    explained_variance   | 0.281       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.49        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0417     |
|    value_loss           | 3.64

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 191.2591463414634
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -133        |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 82          |
|    time_elapsed         | 1370        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006986929 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.65       |
|    explained_variance   | 0.314       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0399     |
|    value_loss           | 3.37

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 194.79814814814816
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -114         |
| time/                   |              |
|    fps                  | 62           |
|    iterations           | 90           |
|    time_elapsed         | 1481         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0071811043 |
|    clip_fraction        | 0.126        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.63        |
|    explained_variance   | 0.315        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.02         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0391      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 98.0
Standard deviation of reward: 0.0
Average successful assignments: 198.02295918367346
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -101         |
| time/                   |              |
|    fps                  | 63           |
|    iterations           | 98           |
|    time_elapsed         | 1591         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0078307865 |
|    clip_fraction        | 0.139        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.58        |
|    explained_variance   | 0.421        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.08         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0405      |
|    value_l

In [18]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -356.0
Standard deviation of reward: 0.0
Average successful assignments: 20.916666666666668
All assignments history: [14, 17, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -369     |
| time/              |          |
|    fps             | 81       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -334.0
Standard deviation of reward: 0.0
Average successful assignments: 25.708333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007832173 |
|    clip_fraction        | 0.08        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.85       |
|    explained_variance   | -0.28       |
|    learning_rate        | 0.00018     |
|    loss                 | 3.45        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0393     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -50.0
Standard deviation of reward: 0.0
Average successful assignments: 66.53333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 10          |
|    time_elapsed         | 138         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010023255 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | 0.00476     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0409     |
|    value_loss           | 4.1

-------- Rollout Summary --------
Total mean reward: -54.0
Standard deviation of reward: 0.0
Average successful assignments: 102.77777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 18          |
|    time_elapsed         | 248         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009316595 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.17        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.115       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0401     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 122.94871794871794
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -347        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 26          |
|    time_elapsed         | 362         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008520516 |
|    clip_fraction        | 0.152       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.46       |
|    explained_variance   | 0.317       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.707       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0416     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 137.60539215686273
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -330        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 34          |
|    time_elapsed         | 473         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009578742 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.01       |
|    explained_variance   | 0.563       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.55        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0493     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 148.46031746031747
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -306        |
| time/                   |             |
|    fps                  | 70          |
|    iterations           | 42          |
|    time_elapsed         | 606         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008084474 |
|    clip_fraction        | 0.162       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.38       |
|    explained_variance   | 0.652       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0479     |
|    value_loss           | 3.5

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 158.25666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -282        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 50          |
|    time_elapsed         | 712         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009018749 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.03       |
|    explained_variance   | 0.621       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.968       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0465     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 104.0
Standard deviation of reward: 0.0
Average successful assignments: 167.76580459770116
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -243         |
| time/                   |              |
|    fps                  | 71           |
|    iterations           | 58           |
|    time_elapsed         | 831          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0077960715 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.8         |
|    explained_variance   | 0.47         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.29         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.041       |
|    value_

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 176.95959595959596
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -201        |
| time/                   |             |
|    fps                  | 71          |
|    iterations           | 66          |
|    time_elapsed         | 942         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006995175 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.63       |
|    explained_variance   | 0.35        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.037      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 185.01463963963963
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -157         |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 74           |
|    time_elapsed         | 1045         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0052828817 |
|    clip_fraction        | 0.0902       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.35        |
|    explained_variance   | 0.517        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.29         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0338      |
|    value_

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 191.85467479674796
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -117         |
| time/                   |              |
|    fps                  | 73           |
|    iterations           | 82           |
|    time_elapsed         | 1148         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0060542584 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.21        |
|    explained_variance   | 0.601        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.86         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0327      |
|    value_

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 197.60092592592594
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -84.8       |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 90          |
|    time_elapsed         | 1250        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005682368 |
|    clip_fraction        | 0.111       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.11       |
|    explained_variance   | 0.599       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.53        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0333     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 202.39965986394557
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -61.4       |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 98          |
|    time_elapsed         | 1358        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006748544 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.1        |
|    explained_variance   | 0.59        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.54        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0312     |
|    value_loss           | 3.

In [19]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -372.0
Standard deviation of reward: 0.0
Average successful assignments: 14.166666666666666
All assignments history: [12, 18, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -370     |
| time/              |          |
|    fps             | 79       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -360.0
Standard deviation of reward: 0.0
Average successful assignments: 17.083333333333332
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -365         |
| time/                   |              |
|    fps                  | 81           |
|    iterations           | 2            |
|    time_elapsed         | 25           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0075320583 |
|    clip_fraction        | 0.0724       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.85        |
|    explained_variance   | -0.108       |
|    learning_rate        | 0.00018      |
|    loss                 | 3.6          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0398      |
|    value

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 122.75
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -363        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 10          |
|    time_elapsed         | 126         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009177519 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.00674     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.039      |
|    value_loss           | 4.33        |
-

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 145.48611111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 18          |
|    time_elapsed         | 228         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010970933 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.68       |
|    explained_variance   | 0.106       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0438     |
|    value_loss           | 2.85

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 154.98076923076923
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -345         |
| time/                   |              |
|    fps                  | 80           |
|    iterations           | 26           |
|    time_elapsed         | 329          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0076755024 |
|    clip_fraction        | 0.138        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.4         |
|    explained_variance   | 0.326        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.389        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0382      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 161.22794117647058
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -324         |
| time/                   |              |
|    fps                  | 81           |
|    iterations           | 34           |
|    time_elapsed         | 429          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0068529267 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.77        |
|    explained_variance   | 0.421        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.18         |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0395      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 167.55357142857142
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -298       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 42         |
|    time_elapsed         | 516        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00811589 |
|    clip_fraction        | 0.141      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.22      |
|    explained_variance   | 0.474      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.29       |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0442    |
|    value_loss           | 2.85       |
---------

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 174.31166666666667
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -273         |
| time/                   |              |
|    fps                  | 86           |
|    iterations           | 50           |
|    time_elapsed         | 592          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0086049335 |
|    clip_fraction        | 0.157        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.86        |
|    explained_variance   | 0.413        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.961        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0425      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 96.0
Standard deviation of reward: 0.0
Average successful assignments: 181.50574712643677
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -232        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 58          |
|    time_elapsed         | 644         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006141404 |
|    clip_fraction        | 0.12        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.73       |
|    explained_variance   | 0.425       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.51        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0405     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: 108.0
Standard deviation of reward: 0.0
Average successful assignments: 187.9570707070707
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -190        |
| time/                   |             |
|    fps                  | 97          |
|    iterations           | 66          |
|    time_elapsed         | 690         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.005426303 |
|    clip_fraction        | 0.087       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.66       |
|    explained_variance   | 0.393       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.82        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0346     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 193.92004504504504
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -150      |
| time/                   |           |
|    fps                  | 104       |
|    iterations           | 74        |
|    time_elapsed         | 725       |
|    total_timesteps      | 75776     |
| train/                  |           |
|    approx_kl            | 0.0070899 |
|    clip_fraction        | 0.117     |
|    clip_range           | 0.15      |
|    entropy_loss         | -3.57     |
|    explained_variance   | 0.39      |
|    learning_rate        | 0.00018   |
|    loss                 | 1.69      |
|    n_updates            | 730       |
|    policy_gradient_loss | -0.0339   |
|    value_loss           | 3.68      |
----------------------------

-------- Rollout Summary --------
Total mean reward: 126.0
Standard deviation of reward: 0.0
Average successful assignments: 199.0508130081301
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -117        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 82          |
|    time_elapsed         | 760         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006079946 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.47       |
|    explained_variance   | 0.474       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.7         |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.036      |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 203.45277777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -94.8       |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 90          |
|    time_elapsed         | 795         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.006992571 |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.43       |
|    explained_variance   | 0.539       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.6         |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0408     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 207.2908163265306
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -79.8       |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 98          |
|    time_elapsed         | 830         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006405322 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.36       |
|    explained_variance   | 0.606       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.66        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0383     |
|    value_loss           | 3.4

In [20]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -372.0
Standard deviation of reward: 0.0
Average successful assignments: 13.25
All assignments history: [11, 8, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -381     |
| time/              |          |
|    fps             | 263      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -300.0
Standard deviation of reward: 0.0
Average successful assignments: 28.541666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -378        |
| time/                   |             |
|    fps                  | 252         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007317029 |
|    clip_fraction        | 0.0731      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.85       |
|    explained_variance   | -0.473      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.59        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0402     |
|    value_loss           | 2

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 114.81666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009561459 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.78       |
|    explained_variance   | 0.00798     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0407     |
|    value_loss           | 4.1

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 143.15740740740742
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 236         |
|    iterations           | 18          |
|    time_elapsed         | 78          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009627849 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.138       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.12        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0421     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 155.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -343        |
| time/                   |             |
|    fps                  | 236         |
|    iterations           | 26          |
|    time_elapsed         | 112         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008680624 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.35       |
|    explained_variance   | 0.228       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.541       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0435     |
|    value_loss           | 3.26        |
--

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 162.8235294117647
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -321        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 34          |
|    time_elapsed         | 146         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008315327 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.8        |
|    explained_variance   | 0.329       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.52        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0455     |
|    value_loss           | 2.73

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 170.66865079365078
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -291        |
| time/                   |             |
|    fps                  | 236         |
|    iterations           | 42          |
|    time_elapsed         | 181         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.006854207 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.04       |
|    explained_variance   | 0.481       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0392     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 179.46666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -263        |
| time/                   |             |
|    fps                  | 236         |
|    iterations           | 50          |
|    time_elapsed         | 216         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007440774 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.77       |
|    explained_variance   | 0.459       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.6         |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0394     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 188.0272988505747
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -219         |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 58           |
|    time_elapsed         | 250          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0063768565 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.67        |
|    explained_variance   | 0.287        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.13         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0366      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 126.0
Standard deviation of reward: 0.0
Average successful assignments: 195.15404040404042
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -174         |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 66           |
|    time_elapsed         | 285          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0060503157 |
|    clip_fraction        | 0.0919       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.37        |
|    explained_variance   | 0.425        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.21         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0324      |
|    value_

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 201.6509009009009
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -128        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 74          |
|    time_elapsed         | 319         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006382173 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.13       |
|    explained_variance   | 0.447       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0308     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 207.3048780487805
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -91.1        |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 82           |
|    time_elapsed         | 354          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0062741996 |
|    clip_fraction        | 0.12         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.11        |
|    explained_variance   | 0.455        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.58         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.032       |
|    value_l

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 212.1648148148148
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -64.4        |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 90           |
|    time_elapsed         | 389          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0055425614 |
|    clip_fraction        | 0.092        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.02        |
|    explained_variance   | 0.53         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.06         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0311      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 216.40731292517006
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -46.2       |
| time/                   |             |
|    fps                  | 236         |
|    iterations           | 98          |
|    time_elapsed         | 424         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005619184 |
|    clip_fraction        | 0.0876      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.02       |
|    explained_variance   | 0.531       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.35        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0287     |
|    value_loss           | 3.

In [21]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -330.0
Standard deviation of reward: 0.0
Average successful assignments: 31.75
All assignments history: [13, 18, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -369     |
| time/              |          |
|    fps             | 254      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -276.0
Standard deviation of reward: 0.0
Average successful assignments: 42.833333333333336
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -371         |
| time/                   |              |
|    fps                  | 247          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0077091637 |
|    clip_fraction        | 0.0808       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.85        |
|    explained_variance   | -0.201       |
|    learning_rate        | 0.00018      |
|    loss                 | 3            |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0418      |
|    value

-------- Rollout Summary --------
Total mean reward: -40.0
Standard deviation of reward: 0.0
Average successful assignments: 100.00833333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 241         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010028444 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | -0.00121    |
|    learning_rate        | 0.00018     |
|    loss                 | 2.32        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0436     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 130.99537037037038
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -361        |
| time/                   |             |
|    fps                  | 240         |
|    iterations           | 18          |
|    time_elapsed         | 76          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010001699 |
|    clip_fraction        | 0.221       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.186       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.205       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0458     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 148.6346153846154
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -347        |
| time/                   |             |
|    fps                  | 240         |
|    iterations           | 26          |
|    time_elapsed         | 110         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008202145 |
|    clip_fraction        | 0.141       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.303       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.33        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0433     |
|    value_loss           | 3.14

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 160.00245098039215
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -326        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 34          |
|    time_elapsed         | 145         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009106535 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.83       |
|    explained_variance   | 0.204       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0497     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 168.62896825396825
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -298        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 42          |
|    time_elapsed         | 180         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007380764 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.16       |
|    explained_variance   | 0.38        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.987       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0397     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 175.25166666666667
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -272       |
| time/                   |            |
|    fps                  | 238        |
|    iterations           | 50         |
|    time_elapsed         | 214        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.00647291 |
|    clip_fraction        | 0.128      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.85      |
|    explained_variance   | 0.322      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.39       |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.0416    |
|    value_loss           | 3.13       |
---------

-------- Rollout Summary --------
Total mean reward: 106.0
Standard deviation of reward: 0.0
Average successful assignments: 182.31465517241378
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -230        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 58          |
|    time_elapsed         | 249         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006965695 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.74       |
|    explained_variance   | 0.303       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.21        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.042      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 189.1439393939394
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -187       |
| time/                   |            |
|    fps                  | 237        |
|    iterations           | 66         |
|    time_elapsed         | 284        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.00797602 |
|    clip_fraction        | 0.159      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.64      |
|    explained_variance   | 0.323      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.43       |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0402    |
|    value_loss           | 3.09       |
---------

-------- Rollout Summary --------
Total mean reward: 126.0
Standard deviation of reward: 0.0
Average successful assignments: 195.17680180180182
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -147        |
| time/                   |             |
|    fps                  | 236         |
|    iterations           | 74          |
|    time_elapsed         | 320         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.007988401 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.57       |
|    explained_variance   | 0.36        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0389     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 200.3668699186992
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -113        |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 82          |
|    time_elapsed         | 356         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.007036591 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.43       |
|    explained_variance   | 0.413       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.44        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0405     |
|    value_loss           | 3.3

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 204.9787037037037
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -88.6        |
| time/                   |              |
|    fps                  | 235          |
|    iterations           | 90           |
|    time_elapsed         | 392          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0066631734 |
|    clip_fraction        | 0.135        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.26        |
|    explained_variance   | 0.505        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.28         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0413      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 148.0
Standard deviation of reward: 0.0
Average successful assignments: 209.13945578231292
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -72.8        |
| time/                   |              |
|    fps                  | 234          |
|    iterations           | 98           |
|    time_elapsed         | 427          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0059678108 |
|    clip_fraction        | 0.109        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.17        |
|    explained_variance   | 0.618        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.48         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0358      |
|    value_

In [22]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 9.416666666666666
All assignments history: [8, 15, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -377     |
| time/              |          |
|    fps             | 260      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -288.0
Standard deviation of reward: 0.0
Average successful assignments: 29.0
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -377         |
| time/                   |              |
|    fps                  | 246          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0074457815 |
|    clip_fraction        | 0.0658       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.85        |
|    explained_variance   | -0.19        |
|    learning_rate        | 0.00018      |
|    loss                 | 3.37         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0409      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 129.34166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 10          |
|    time_elapsed         | 43          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009329675 |
|    clip_fraction        | 0.154       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.8        |
|    explained_variance   | 0.00877     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.97        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0419     |
|    value_loss           | 4.07

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 150.5648148148148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -362        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 18          |
|    time_elapsed         | 78          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009600667 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.71       |
|    explained_variance   | 0.183       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.8         |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0438     |
|    value_loss           | 3.01

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 161.0897435897436
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -349        |
| time/                   |             |
|    fps                  | 232         |
|    iterations           | 26          |
|    time_elapsed         | 114         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008473956 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.49       |
|    explained_variance   | 0.337       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.64        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0443     |
|    value_loss           | 2.8 

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 168.99019607843138
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -329        |
| time/                   |             |
|    fps                  | 231         |
|    iterations           | 34          |
|    time_elapsed         | 150         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008668801 |
|    clip_fraction        | 0.146       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.98       |
|    explained_variance   | 0.398       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.966       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0471     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 175.79365079365078
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -305        |
| time/                   |             |
|    fps                  | 231         |
|    iterations           | 42          |
|    time_elapsed         | 185         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008229945 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.39       |
|    explained_variance   | 0.442       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.75        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0486     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 100.0
Standard deviation of reward: 0.0
Average successful assignments: 183.99666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -278        |
| time/                   |             |
|    fps                  | 231         |
|    iterations           | 50          |
|    time_elapsed         | 221         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006726073 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.93       |
|    explained_variance   | 0.423       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0402     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 191.25574712643677
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -234        |
| time/                   |             |
|    fps                  | 231         |
|    iterations           | 58          |
|    time_elapsed         | 257         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006395378 |
|    clip_fraction        | 0.116       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.59       |
|    explained_variance   | 0.272       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0385     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 116.0
Standard deviation of reward: 0.0
Average successful assignments: 197.34722222222223
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 231         |
|    iterations           | 66          |
|    time_elapsed         | 291         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.005611537 |
|    clip_fraction        | 0.0907      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.37       |
|    explained_variance   | 0.319       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.21        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0323     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 202.30180180180182
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -142         |
| time/                   |              |
|    fps                  | 233          |
|    iterations           | 74           |
|    time_elapsed         | 323          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0059178825 |
|    clip_fraction        | 0.089        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.17        |
|    explained_variance   | 0.461        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.31         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.031       |
|    value_

-------- Rollout Summary --------
Total mean reward: 116.0
Standard deviation of reward: 0.0
Average successful assignments: 206.5132113821138
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -101         |
| time/                   |              |
|    fps                  | 235          |
|    iterations           | 82           |
|    time_elapsed         | 356          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0058702664 |
|    clip_fraction        | 0.0924       |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.05        |
|    explained_variance   | 0.531        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.44         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0288      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 210.16296296296295
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -70.7        |
| time/                   |              |
|    fps                  | 239          |
|    iterations           | 90           |
|    time_elapsed         | 385          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0061684353 |
|    clip_fraction        | 0.106        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.09        |
|    explained_variance   | 0.583        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.34         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0357      |
|    value_

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 213.30527210884352
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -50.1       |
| time/                   |             |
|    fps                  | 242         |
|    iterations           | 98          |
|    time_elapsed         | 413         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006024462 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.12       |
|    explained_variance   | 0.619       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0389     |
|    value_loss           | 3.