In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def train_ridge_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    ridge_model = Ridge()
    ridge_model.fit(X_train_scaled, y_train)
    return ridge_model, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.01.csv')
ridge_model, scaler = train_ridge_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.01.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = ridge_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']  # This would be prior to overwriting with predictions if you run this block again

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


MAE: 0.45497986053091444
RMSE: 0.5737672522389483
R-squared: 0.997773370716979
RAE: 0.04751820477894557


In [2]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -98.0
Standard deviation of reward: 0.0
Average successful assignments: 2.6
All assignments history: [2, 5, 7, 4, 6, 4, 5, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -91.6    |
| time/              |          |
|    fps             | 664      |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -92.0
Standard deviation of reward: 0.0
Average successful assignments: 3.55
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -90.8       |
| time/                   |             |
|    fps                  | 604         |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.006986077 |
|    clip_fraction        | 0.0797      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.02       |
|    explained_variance   | -0.37       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.09        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0331     |
|    value_loss           | 14.9        |
--

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 20.495
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -89.6        |
| time/                   |              |
|    fps                  | 573          |
|    iterations           | 10           |
|    time_elapsed         | 17           |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0074089253 |
|    clip_fraction        | 0.113        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.96        |
|    explained_variance   | 0.276        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.93         |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.0371      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 26.78611111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -83.8       |
| time/                   |             |
|    fps                  | 569         |
|    iterations           | 18          |
|    time_elapsed         | 32          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.005883266 |
|    clip_fraction        | 0.0982      |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.84       |
|    explained_variance   | 0.796       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0365     |
|    value_loss           | 3.45

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 30.638461538461538
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -71.6        |
| time/                   |              |
|    fps                  | 566          |
|    iterations           | 26           |
|    time_elapsed         | 46           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0063409703 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.5         |
|    explained_variance   | 0.83         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.47         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0393      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 34.21764705882353
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -50.2        |
| time/                   |              |
|    fps                  | 565          |
|    iterations           | 34           |
|    time_elapsed         | 61           |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0073040696 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.72        |
|    explained_variance   | 0.797        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.02         |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0363      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 38.1
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -18.5        |
| time/                   |              |
|    fps                  | 555          |
|    iterations           | 42           |
|    time_elapsed         | 77           |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0061048893 |
|    clip_fraction        | 0.138        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.93        |
|    explained_variance   | 0.835        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.677        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0359      |
|    value_loss           

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 41.757
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 4.02         |
| time/                   |              |
|    fps                  | 553          |
|    iterations           | 50           |
|    time_elapsed         | 92           |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0048720352 |
|    clip_fraction        | 0.0923       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.36        |
|    explained_variance   | 0.846        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.09         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0268      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 44.88879310344828
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 16.5        |
| time/                   |             |
|    fps                  | 552         |
|    iterations           | 58          |
|    time_elapsed         | 107         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.003863777 |
|    clip_fraction        | 0.0873      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.13       |
|    explained_variance   | 0.887       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.453       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0245     |
|    value_loss           | 1.78

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 47.52045454545455
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 23.6         |
| time/                   |              |
|    fps                  | 548          |
|    iterations           | 66           |
|    time_elapsed         | 123          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0039434694 |
|    clip_fraction        | 0.0831       |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.92        |
|    explained_variance   | 0.925        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.507        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0261      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 49.73783783783784
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 29.5        |
| time/                   |             |
|    fps                  | 547         |
|    iterations           | 74          |
|    time_elapsed         | 138         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.005200545 |
|    clip_fraction        | 0.088       |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.89       |
|    explained_variance   | 0.94        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.12        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0228     |
|    value_loss           | 1.02

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 51.44634146341463
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 29.4         |
| time/                   |              |
|    fps                  | 547          |
|    iterations           | 82           |
|    time_elapsed         | 153          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0055013793 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.97        |
|    explained_variance   | 0.929        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.373        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0246      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 52.88
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 30.1        |
| time/                   |             |
|    fps                  | 544         |
|    iterations           | 90          |
|    time_elapsed         | 169         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.004479411 |
|    clip_fraction        | 0.0875      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.92       |
|    explained_variance   | 0.945       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0229     |
|    value_loss           | 0.958       |
--

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 54.093877551020405
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 31.2        |
| time/                   |             |
|    fps                  | 534         |
|    iterations           | 98          |
|    time_elapsed         | 187         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009457154 |
|    clip_fraction        | 0.123       |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.98       |
|    explained_variance   | 0.942       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.184       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0284     |
|    value_loss           | 1.0

In [3]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -96.0
Standard deviation of reward: 0.0
Average successful assignments: 2.75
All assignments history: [1, 2, 3, 8, 3, 2, 6, 2, 3, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -93      |
| time/              |          |
|    fps             | 551      |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -96.0
Standard deviation of reward: 0.0
Average successful assignments: 2.775
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -92.9        |
| time/                   |              |
|    fps                  | 520          |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0060715135 |
|    clip_fraction        | 0.0592       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.02        |
|    explained_variance   | -0.315       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.41         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0312      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 19.16
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -90.4       |
| time/                   |             |
|    fps                  | 440         |
|    iterations           | 10          |
|    time_elapsed         | 23          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009528417 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.95       |
|    explained_variance   | 0.163       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.39        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0404     |
|    value_loss           | 6.3         |
---

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 26.336111111111112
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -82.7       |
| time/                   |             |
|    fps                  | 386         |
|    iterations           | 18          |
|    time_elapsed         | 47          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.005675991 |
|    clip_fraction        | 0.0731      |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.81       |
|    explained_variance   | 0.657       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.71        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0334     |
|    value_loss           | 4.6

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 31.30576923076923
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -67          |
| time/                   |              |
|    fps                  | 361          |
|    iterations           | 26           |
|    time_elapsed         | 73           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0068676644 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.33        |
|    explained_variance   | 0.784        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.05         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0366      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 35.675
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -42.3       |
| time/                   |             |
|    fps                  | 352         |
|    iterations           | 34          |
|    time_elapsed         | 98          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.006982392 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.53       |
|    explained_variance   | 0.766       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.832       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0427     |
|    value_loss           | 2.78        |
-

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 39.88809523809524
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -12.8       |
| time/                   |             |
|    fps                  | 355         |
|    iterations           | 42          |
|    time_elapsed         | 120         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008686046 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.85       |
|    explained_variance   | 0.79        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.942       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0431     |
|    value_loss           | 2.72

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 43.252
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 3.76         |
| time/                   |              |
|    fps                  | 356          |
|    iterations           | 50           |
|    time_elapsed         | 143          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0048656696 |
|    clip_fraction        | 0.083        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.56        |
|    explained_variance   | 0.843        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.728        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.027       |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 46.014655172413796
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 14          |
| time/                   |             |
|    fps                  | 357         |
|    iterations           | 58          |
|    time_elapsed         | 166         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.004668699 |
|    clip_fraction        | 0.108       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.28       |
|    explained_variance   | 0.859       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.755       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0291     |
|    value_loss           | 2.1

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 48.26969696969697
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 20.2        |
| time/                   |             |
|    fps                  | 357         |
|    iterations           | 66          |
|    time_elapsed         | 188         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.003874706 |
|    clip_fraction        | 0.0728      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.22       |
|    explained_variance   | 0.871       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.841       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0252     |
|    value_loss           | 2.08

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 50.054054054054056
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 21.3        |
| time/                   |             |
|    fps                  | 358         |
|    iterations           | 74          |
|    time_elapsed         | 211         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.004437351 |
|    clip_fraction        | 0.0999      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.14       |
|    explained_variance   | 0.892       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.539       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0271     |
|    value_loss           | 1.8

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 51.58170731707317
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 24.1        |
| time/                   |             |
|    fps                  | 359         |
|    iterations           | 82          |
|    time_elapsed         | 233         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.004325694 |
|    clip_fraction        | 0.0827      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.1        |
|    explained_variance   | 0.898       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.568       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0246     |
|    value_loss           | 1.75

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 52.88611111111111
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 26.1         |
| time/                   |              |
|    fps                  | 359          |
|    iterations           | 90           |
|    time_elapsed         | 256          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0046202336 |
|    clip_fraction        | 0.0943       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.04        |
|    explained_variance   | 0.922        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.384        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0273      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 54.03775510204082
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 26.9         |
| time/                   |              |
|    fps                  | 360          |
|    iterations           | 98           |
|    time_elapsed         | 278          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0042031407 |
|    clip_fraction        | 0.086        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.08        |
|    explained_variance   | 0.931        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.193        |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0256      |
|    value_lo

In [4]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -88.0
Standard deviation of reward: 0.0
Average successful assignments: 4.9
All assignments history: [4, 6, 2, 0, 4, 6, 4, 5, 4, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -92.4    |
| time/              |          |
|    fps             | 435      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -90.0
Standard deviation of reward: 0.0
Average successful assignments: 4.875
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -91.5       |
| time/                   |             |
|    fps                  | 398         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.006746405 |
|    clip_fraction        | 0.0619      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.02       |
|    explained_variance   | -0.301      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.4         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.033      |
|    value_loss           | 18.7        |
-

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 18.11
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -89.4        |
| time/                   |              |
|    fps                  | 375          |
|    iterations           | 10           |
|    time_elapsed         | 27           |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0077791093 |
|    clip_fraction        | 0.137        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.96        |
|    explained_variance   | 0.155        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.85         |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.0378      |
|    value_loss          

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 25.225
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -84.8       |
| time/                   |             |
|    fps                  | 370         |
|    iterations           | 18          |
|    time_elapsed         | 49          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.007937529 |
|    clip_fraction        | 0.136       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.83       |
|    explained_variance   | 0.761       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0411     |
|    value_loss           | 3.72        |
-

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 29.759615384615383
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -73.1        |
| time/                   |              |
|    fps                  | 363          |
|    iterations           | 26           |
|    time_elapsed         | 73           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0067337034 |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.49        |
|    explained_variance   | 0.811        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.35         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0387      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 33.5735294117647
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -51.2       |
| time/                   |             |
|    fps                  | 360         |
|    iterations           | 34          |
|    time_elapsed         | 96          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007552649 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.81       |
|    explained_variance   | 0.803       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.877       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0399     |
|    value_loss           | 2.57 

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 37.53928571428571
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -22.5        |
| time/                   |              |
|    fps                  | 362          |
|    iterations           | 42           |
|    time_elapsed         | 118          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0061817644 |
|    clip_fraction        | 0.125        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.98        |
|    explained_variance   | 0.789        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.848        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0336      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 41.235
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 0.74        |
| time/                   |             |
|    fps                  | 364         |
|    iterations           | 50          |
|    time_elapsed         | 140         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.005065658 |
|    clip_fraction        | 0.0845      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.51       |
|    explained_variance   | 0.849       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.525       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0287     |
|    value_loss           | 1.97        |
-

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 44.331896551724135
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 11.9        |
| time/                   |             |
|    fps                  | 366         |
|    iterations           | 58          |
|    time_elapsed         | 162         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.004208299 |
|    clip_fraction        | 0.0857      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.2        |
|    explained_variance   | 0.858       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.679       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0273     |
|    value_loss           | 1.9

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 46.92272727272727
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 20.3         |
| time/                   |              |
|    fps                  | 366          |
|    iterations           | 66           |
|    time_elapsed         | 184          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0043616667 |
|    clip_fraction        | 0.0787       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.06        |
|    explained_variance   | 0.895        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.601        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0215      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 49.13310810810811
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 26           |
| time/                   |              |
|    fps                  | 363          |
|    iterations           | 74           |
|    time_elapsed         | 208          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0046556983 |
|    clip_fraction        | 0.0902       |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.92        |
|    explained_variance   | 0.917        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.486        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.024       |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 51.014634146341464
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 29.5         |
| time/                   |              |
|    fps                  | 363          |
|    iterations           | 82           |
|    time_elapsed         | 231          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0043614395 |
|    clip_fraction        | 0.092        |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.86        |
|    explained_variance   | 0.925        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.363        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0254      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 52.67888888888889
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 32.3         |
| time/                   |              |
|    fps                  | 358          |
|    iterations           | 90           |
|    time_elapsed         | 257          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0043445453 |
|    clip_fraction        | 0.0717       |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.88        |
|    explained_variance   | 0.929        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.31         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0208      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 54.10408163265306
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 34.2        |
| time/                   |             |
|    fps                  | 359         |
|    iterations           | 98          |
|    time_elapsed         | 279         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.007152699 |
|    clip_fraction        | 0.109       |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.87       |
|    explained_variance   | 0.916       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.351       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0261     |
|    value_loss           | 1.16

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -96.0
Standard deviation of reward: 0.0
Average successful assignments: 3.1
All assignments history: [5, 3, 3, 4, 3, 5, 3, 3, 7, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -91.6    |
| time/              |          |
|    fps             | 366      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -66.0
Standard deviation of reward: 0.0
Average successful assignments: 6.925
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -91.3       |
| time/                   |             |
|    fps                  | 372         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007508908 |
|    clip_fraction        | 0.101       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.02       |
|    explained_variance   | 0.067       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.69        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0375     |
|    value_loss           | 13.4        |
-

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 18.345
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -89.6       |
| time/                   |             |
|    fps                  | 339         |
|    iterations           | 10          |
|    time_elapsed         | 30          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.007810382 |
|    clip_fraction        | 0.135       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.97       |
|    explained_variance   | 0.249       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.09        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0379     |
|    value_loss           | 6.62        |
--

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 24.169444444444444
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -85.8        |
| time/                   |              |
|    fps                  | 336          |
|    iterations           | 18           |
|    time_elapsed         | 54           |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0062584453 |
|    clip_fraction        | 0.0943       |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.88        |
|    explained_variance   | 0.639        |
|    learning_rate        | 0.00018      |
|    loss                 | 2.06         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0366      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 27.630769230769232
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -76.6        |
| time/                   |              |
|    fps                  | 328          |
|    iterations           | 26           |
|    time_elapsed         | 80           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0067687957 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.56        |
|    explained_variance   | 0.794        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.16         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0391      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 31.75735294117647
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -53.2        |
| time/                   |              |
|    fps                  | 318          |
|    iterations           | 34           |
|    time_elapsed         | 109          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0081238095 |
|    clip_fraction        | 0.133        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.85        |
|    explained_variance   | 0.727        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.18         |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0377      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 35.99404761904762
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -22.7        |
| time/                   |              |
|    fps                  | 309          |
|    iterations           | 42           |
|    time_elapsed         | 139          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0068442095 |
|    clip_fraction        | 0.138        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.05        |
|    explained_variance   | 0.76         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.927        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.037       |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 39.927
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 2.12         |
| time/                   |              |
|    fps                  | 305          |
|    iterations           | 50           |
|    time_elapsed         | 167          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0049159545 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.49        |
|    explained_variance   | 0.818        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.881        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.031       |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 43.07672413793104
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 13.1         |
| time/                   |              |
|    fps                  | 300          |
|    iterations           | 58           |
|    time_elapsed         | 197          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0043830294 |
|    clip_fraction        | 0.0763       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.31        |
|    explained_variance   | 0.843        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.751        |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0214      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 45.6
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 18          |
| time/                   |             |
|    fps                  | 292         |
|    iterations           | 66          |
|    time_elapsed         | 230         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.003364246 |
|    clip_fraction        | 0.0827      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.14       |
|    explained_variance   | 0.883       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.666       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0251     |
|    value_loss           | 1.89        |
---

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 47.7
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 22.3        |
| time/                   |             |
|    fps                  | 284         |
|    iterations           | 74          |
|    time_elapsed         | 266         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.003845436 |
|    clip_fraction        | 0.0844      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.05       |
|    explained_variance   | 0.921       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.47        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0232     |
|    value_loss           | 1.49        |
---

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 49.51341463414634
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 25          |
| time/                   |             |
|    fps                  | 277         |
|    iterations           | 82          |
|    time_elapsed         | 302         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.005040629 |
|    clip_fraction        | 0.0872      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.88       |
|    explained_variance   | 0.943       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.24        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0253     |
|    value_loss           | 1.05

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 51.06666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 27.7        |
| time/                   |             |
|    fps                  | 271         |
|    iterations           | 90          |
|    time_elapsed         | 338         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.004259509 |
|    clip_fraction        | 0.0936      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.92       |
|    explained_variance   | 0.93        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.179       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.027      |
|    value_loss           | 1.24

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 52.429591836734694
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 29.8         |
| time/                   |              |
|    fps                  | 265          |
|    iterations           | 98           |
|    time_elapsed         | 378          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0047206995 |
|    clip_fraction        | 0.0988       |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.86        |
|    explained_variance   | 0.95         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.12         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0255      |
|    value_l

In [6]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -82.0
Standard deviation of reward: 0.0
Average successful assignments: 6.15
All assignments history: [2, 5, 4, 5, 6, 2, 2, 1, 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -93.4    |
| time/              |          |
|    fps             | 237      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -62.0
Standard deviation of reward: 0.0
Average successful assignments: 8.775
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -92.9        |
| time/                   |              |
|    fps                  | 220          |
|    iterations           | 2            |
|    time_elapsed         | 9            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0061978074 |
|    clip_fraction        | 0.0599       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.02        |
|    explained_variance   | -0.136       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.55         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0315      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 19.235
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -89.9        |
| time/                   |              |
|    fps                  | 203          |
|    iterations           | 10           |
|    time_elapsed         | 50           |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0075360667 |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.97        |
|    explained_variance   | 0.27         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.4          |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.0393      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 24.17222222222222
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -84.7        |
| time/                   |              |
|    fps                  | 207          |
|    iterations           | 18           |
|    time_elapsed         | 88           |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0064984714 |
|    clip_fraction        | 0.0996       |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.84        |
|    explained_variance   | 0.765        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.31         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0364      |
|    value_los

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 28.315384615384616
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -73.9       |
| time/                   |             |
|    fps                  | 209         |
|    iterations           | 26          |
|    time_elapsed         | 126         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007253642 |
|    clip_fraction        | 0.131       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.56       |
|    explained_variance   | 0.872       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0404     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 32.247058823529414
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -52.8       |
| time/                   |             |
|    fps                  | 211         |
|    iterations           | 34          |
|    time_elapsed         | 164         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008323364 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.82       |
|    explained_variance   | 0.769       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0449     |
|    value_loss           | 3.0

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 36.194047619047616
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -23.1        |
| time/                   |              |
|    fps                  | 212          |
|    iterations           | 42           |
|    time_elapsed         | 202          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0073488783 |
|    clip_fraction        | 0.164        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.04        |
|    explained_variance   | 0.738        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.22         |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0401      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 39.843
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -1.3         |
| time/                   |              |
|    fps                  | 210          |
|    iterations           | 50           |
|    time_elapsed         | 242          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0058612013 |
|    clip_fraction        | 0.108        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.58        |
|    explained_variance   | 0.801        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.798        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0286      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 43.00948275862069
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 10.1         |
| time/                   |              |
|    fps                  | 210          |
|    iterations           | 58           |
|    time_elapsed         | 281          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0049323766 |
|    clip_fraction        | 0.0949       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.4         |
|    explained_variance   | 0.811        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.72         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.028       |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 45.696969696969695
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 17.7         |
| time/                   |              |
|    fps                  | 211          |
|    iterations           | 66           |
|    time_elapsed         | 319          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0058189696 |
|    clip_fraction        | 0.101        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.24        |
|    explained_variance   | 0.866        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.633        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0255      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 47.91148648648649
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 22.4         |
| time/                   |              |
|    fps                  | 212          |
|    iterations           | 74           |
|    time_elapsed         | 356          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0046209125 |
|    clip_fraction        | 0.0818       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.11        |
|    explained_variance   | 0.912        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.366        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0233      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 49.78414634146341
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 26.2        |
| time/                   |             |
|    fps                  | 215         |
|    iterations           | 82          |
|    time_elapsed         | 389         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.004738368 |
|    clip_fraction        | 0.0833      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.04       |
|    explained_variance   | 0.895       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.409       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0267     |
|    value_loss           | 1.65

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 51.37277777777778
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 100        |
|    ep_rew_mean          | 28         |
| time/                   |            |
|    fps                  | 218        |
|    iterations           | 90         |
|    time_elapsed         | 422        |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00519994 |
|    clip_fraction        | 0.108      |
|    clip_range           | 0.15       |
|    entropy_loss         | -2.02      |
|    explained_variance   | 0.91       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.425      |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0284    |
|    value_loss           | 1.36       |
----------

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 52.71632653061224
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 29.2        |
| time/                   |             |
|    fps                  | 220         |
|    iterations           | 98          |
|    time_elapsed         | 455         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005021119 |
|    clip_fraction        | 0.0968      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.97       |
|    explained_variance   | 0.931       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.21        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0248     |
|    value_loss           | 0.97

In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -100.0
Standard deviation of reward: 0.0
Average successful assignments: 2.15
All assignments history: [4, 3, 6, 4, 6, 2, 5, 6, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -91.4    |
| time/              |          |
|    fps             | 304      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -74.0
Standard deviation of reward: 0.0
Average successful assignments: 5.4
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -91.4       |
| time/                   |             |
|    fps                  | 278         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.006848591 |
|    clip_fraction        | 0.0665      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.02       |
|    explained_variance   | -0.0769     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.94        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0328     |
|    value_loss           | 13.9        |
---

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 23.315
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -88.7       |
| time/                   |             |
|    fps                  | 249         |
|    iterations           | 10          |
|    time_elapsed         | 41          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009091396 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.94       |
|    explained_variance   | 0.389       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.41        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0439     |
|    value_loss           | 4.97        |
-

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 28.605555555555554
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -82.2        |
| time/                   |              |
|    fps                  | 245          |
|    iterations           | 18           |
|    time_elapsed         | 74           |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0071374867 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.8         |
|    explained_variance   | 0.787        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.17         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0379      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 32.16538461538462
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -69.6        |
| time/                   |              |
|    fps                  | 247          |
|    iterations           | 26           |
|    time_elapsed         | 107          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0075247018 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.38        |
|    explained_variance   | 0.854        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.717        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0393      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 36.00147058823529
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -44          |
| time/                   |              |
|    fps                  | 249          |
|    iterations           | 34           |
|    time_elapsed         | 139          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0065081646 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.59        |
|    explained_variance   | 0.795        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.22         |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0375      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 39.75119047619047
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -15.7       |
| time/                   |             |
|    fps                  | 244         |
|    iterations           | 42          |
|    time_elapsed         | 175         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007199224 |
|    clip_fraction        | 0.151       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.89       |
|    explained_variance   | 0.771       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0406     |
|    value_loss           | 2.69

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 43.249
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 4.74         |
| time/                   |              |
|    fps                  | 245          |
|    iterations           | 50           |
|    time_elapsed         | 208          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0049142563 |
|    clip_fraction        | 0.0931       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.51        |
|    explained_variance   | 0.775        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.32         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0281      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 46.1
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 14.7         |
| time/                   |              |
|    fps                  | 244          |
|    iterations           | 58           |
|    time_elapsed         | 243          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0047612465 |
|    clip_fraction        | 0.0995       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.3         |
|    explained_variance   | 0.848        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.874        |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0274      |
|    value_loss           

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 48.59469696969697
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 22.2         |
| time/                   |              |
|    fps                  | 242          |
|    iterations           | 66           |
|    time_elapsed         | 278          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0038122053 |
|    clip_fraction        | 0.0605       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.14        |
|    explained_variance   | 0.876        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.716        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0212      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 50.63716216216216
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 26.2         |
| time/                   |              |
|    fps                  | 243          |
|    iterations           | 74           |
|    time_elapsed         | 311          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0043813046 |
|    clip_fraction        | 0.0787       |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.99        |
|    explained_variance   | 0.888        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.497        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0223      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 52.36585365853659
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 29          |
| time/                   |             |
|    fps                  | 246         |
|    iterations           | 82          |
|    time_elapsed         | 340         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.005050987 |
|    clip_fraction        | 0.0925      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.98       |
|    explained_variance   | 0.918       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.407       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.022      |
|    value_loss           | 1.26

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 53.836666666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 31.2         |
| time/                   |              |
|    fps                  | 250          |
|    iterations           | 90           |
|    time_elapsed         | 368          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0049069067 |
|    clip_fraction        | 0.0917       |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.98        |
|    explained_variance   | 0.901        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.568        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0263      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 55.09591836734694
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 100        |
|    ep_rew_mean          | 32.4       |
| time/                   |            |
|    fps                  | 255        |
|    iterations           | 98         |
|    time_elapsed         | 392        |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.00718562 |
|    clip_fraction        | 0.136      |
|    clip_range           | 0.15       |
|    entropy_loss         | -1.91      |
|    explained_variance   | 0.934      |
|    learning_rate        | 0.00018    |
|    loss                 | 1          |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0276    |
|    value_loss           | 0.916      |
----------

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -86.0
Standard deviation of reward: 0.0
Average successful assignments: 5.9
All assignments history: [5, 3, 8, 3, 5, 6, 5, 4, 4, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -90.4    |
| time/              |          |
|    fps             | 436      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -74.0
Standard deviation of reward: 0.0
Average successful assignments: 7.225
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -91.1        |
| time/                   |              |
|    fps                  | 397          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0063776365 |
|    clip_fraction        | 0.0612       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.02        |
|    explained_variance   | 0.237        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.92         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0336      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 18.245
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -89.6       |
| time/                   |             |
|    fps                  | 374         |
|    iterations           | 10          |
|    time_elapsed         | 27          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.008859288 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.96       |
|    explained_variance   | 0.329       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.06        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0402     |
|    value_loss           | 5.45        |
-

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 23.57777777777778
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -84.9        |
| time/                   |              |
|    fps                  | 373          |
|    iterations           | 18           |
|    time_elapsed         | 49           |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0076025235 |
|    clip_fraction        | 0.145        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.85        |
|    explained_variance   | 0.786        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.03         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0406      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 28.130769230769232
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -73.6        |
| time/                   |              |
|    fps                  | 371          |
|    iterations           | 26           |
|    time_elapsed         | 71           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0079469485 |
|    clip_fraction        | 0.137        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.58        |
|    explained_variance   | 0.858        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.09         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0424      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 32.46470588235294
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -53         |
| time/                   |             |
|    fps                  | 370         |
|    iterations           | 34          |
|    time_elapsed         | 93          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007842616 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.91       |
|    explained_variance   | 0.801       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.824       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0428     |
|    value_loss           | 2.93

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 36.76190476190476
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -23         |
| time/                   |             |
|    fps                  | 371         |
|    iterations           | 42          |
|    time_elapsed         | 115         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.005858787 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.07       |
|    explained_variance   | 0.843       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.701       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0351     |
|    value_loss           | 2.18

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 40.507
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -1.06        |
| time/                   |              |
|    fps                  | 371          |
|    iterations           | 50           |
|    time_elapsed         | 137          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0054885256 |
|    clip_fraction        | 0.111        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.61        |
|    explained_variance   | 0.835        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.621        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0301      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 43.724137931034484
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 13.6        |
| time/                   |             |
|    fps                  | 370         |
|    iterations           | 58          |
|    time_elapsed         | 160         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.004473529 |
|    clip_fraction        | 0.081       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.17       |
|    explained_variance   | 0.892       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0264     |
|    value_loss           | 1.7

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 46.43333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 22.6        |
| time/                   |             |
|    fps                  | 370         |
|    iterations           | 66          |
|    time_elapsed         | 182         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.004237742 |
|    clip_fraction        | 0.0893      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.95       |
|    explained_variance   | 0.912       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.502       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0248     |
|    value_loss           | 1.41

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 48.78243243243243
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 100        |
|    ep_rew_mean          | 28.3       |
| time/                   |            |
|    fps                  | 372        |
|    iterations           | 74         |
|    time_elapsed         | 203        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.00470504 |
|    clip_fraction        | 0.0924     |
|    clip_range           | 0.15       |
|    entropy_loss         | -1.89      |
|    explained_variance   | 0.931      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.228      |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.025     |
|    value_loss           | 1.07       |
----------

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 50.73719512195122
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 100        |
|    ep_rew_mean          | 31.1       |
| time/                   |            |
|    fps                  | 371        |
|    iterations           | 82         |
|    time_elapsed         | 226        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00522885 |
|    clip_fraction        | 0.0979     |
|    clip_range           | 0.15       |
|    entropy_loss         | -1.86      |
|    explained_variance   | 0.941      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.786      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.027     |
|    value_loss           | 0.884      |
----------

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 52.388888888888886
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 33           |
| time/                   |              |
|    fps                  | 370          |
|    iterations           | 90           |
|    time_elapsed         | 248          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0050236816 |
|    clip_fraction        | 0.0951       |
|    clip_range           | 0.15         |
|    entropy_loss         | -1.86        |
|    explained_variance   | 0.951        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.155        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0241      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 53.78928571428571
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 34.1        |
| time/                   |             |
|    fps                  | 370         |
|    iterations           | 98          |
|    time_elapsed         | 271         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006544858 |
|    clip_fraction        | 0.111       |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.78       |
|    explained_variance   | 0.957       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0173      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0275     |
|    value_loss           | 0.61

In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -92.0
Standard deviation of reward: 0.0
Average successful assignments: 4.05
All assignments history: [5, 5, 5, 5, 5, 1, 4, 3, 6, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -91.8    |
| time/              |          |
|    fps             | 424      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -64.0
Standard deviation of reward: 0.0
Average successful assignments: 7.575
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -91.7       |
| time/                   |             |
|    fps                  | 401         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.006361192 |
|    clip_fraction        | 0.071       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.02       |
|    explained_variance   | -0.127      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.26        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0326     |
|    value_loss           | 14.8        |
-

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 21.47
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -88.9       |
| time/                   |             |
|    fps                  | 367         |
|    iterations           | 10          |
|    time_elapsed         | 27          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.007988105 |
|    clip_fraction        | 0.152       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.95       |
|    explained_variance   | 0.246       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.55        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0404     |
|    value_loss           | 6.22        |
---

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 27.16111111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -82.3       |
| time/                   |             |
|    fps                  | 366         |
|    iterations           | 18          |
|    time_elapsed         | 50          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.007189689 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.8        |
|    explained_variance   | 0.797       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.935       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0402     |
|    value_loss           | 3.23

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 30.89423076923077
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -70.2        |
| time/                   |              |
|    fps                  | 365          |
|    iterations           | 26           |
|    time_elapsed         | 72           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0064216405 |
|    clip_fraction        | 0.118        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.41        |
|    explained_variance   | 0.855        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.85         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0395      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 34.436764705882354
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -48.9        |
| time/                   |              |
|    fps                  | 365          |
|    iterations           | 34           |
|    time_elapsed         | 95           |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0075704493 |
|    clip_fraction        | 0.201        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.66        |
|    explained_variance   | 0.802        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.973        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0474      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 38.195238095238096
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -20.1        |
| time/                   |              |
|    fps                  | 365          |
|    iterations           | 42           |
|    time_elapsed         | 117          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0061571817 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.91        |
|    explained_variance   | 0.842        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.739        |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0334      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 41.561
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 1.68         |
| time/                   |              |
|    fps                  | 363          |
|    iterations           | 50           |
|    time_elapsed         | 140          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0055228192 |
|    clip_fraction        | 0.112        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.43        |
|    explained_variance   | 0.881        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.408        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0313      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 44.25258620689655
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 11.4        |
| time/                   |             |
|    fps                  | 363         |
|    iterations           | 58          |
|    time_elapsed         | 163         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.004170131 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.23       |
|    explained_variance   | 0.911       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.377       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0304     |
|    value_loss           | 1.41

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 46.403030303030306
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 15.9        |
| time/                   |             |
|    fps                  | 363         |
|    iterations           | 66          |
|    time_elapsed         | 185         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.003861435 |
|    clip_fraction        | 0.0768      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.14       |
|    explained_variance   | 0.927       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.265       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0252     |
|    value_loss           | 1.1

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 48.18783783783784
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 18.6         |
| time/                   |              |
|    fps                  | 363          |
|    iterations           | 74           |
|    time_elapsed         | 208          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0045748586 |
|    clip_fraction        | 0.0998       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.1         |
|    explained_variance   | 0.927        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.962        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0297      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 49.75792682926829
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 21.1         |
| time/                   |              |
|    fps                  | 363          |
|    iterations           | 82           |
|    time_elapsed         | 230          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0055428557 |
|    clip_fraction        | 0.11         |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.07        |
|    explained_variance   | 0.925        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.202        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0316      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 51.10611111111111
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 23.1         |
| time/                   |              |
|    fps                  | 363          |
|    iterations           | 90           |
|    time_elapsed         | 253          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0049646497 |
|    clip_fraction        | 0.0958       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.05        |
|    explained_variance   | 0.933        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.18         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0273      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 52.30408163265306
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 24.7        |
| time/                   |             |
|    fps                  | 363         |
|    iterations           | 98          |
|    time_elapsed         | 276         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006441596 |
|    clip_fraction        | 0.0909      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.99       |
|    explained_variance   | 0.952       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0948      |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0254     |
|    value_loss           | 0.66

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -100.0
Standard deviation of reward: 0.0
Average successful assignments: 2.3
All assignments history: [6, 7, 2, 4, 0, 7, 4, 5, 7, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -90.8    |
| time/              |          |
|    fps             | 383      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -60.0
Standard deviation of reward: 0.0
Average successful assignments: 7.225
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -91.1        |
| time/                   |              |
|    fps                  | 376          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0071851453 |
|    clip_fraction        | 0.0924       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.02        |
|    explained_variance   | -0.0624      |
|    learning_rate        | 0.00018      |
|    loss                 | 2.92         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0357      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 18.465
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -88.7       |
| time/                   |             |
|    fps                  | 371         |
|    iterations           | 10          |
|    time_elapsed         | 27          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.008031938 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.94       |
|    explained_variance   | 0.237       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.89        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0381     |
|    value_loss           | 5.82        |
--

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 26.7
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -82.1       |
| time/                   |             |
|    fps                  | 373         |
|    iterations           | 18          |
|    time_elapsed         | 49          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.006960498 |
|    clip_fraction        | 0.106       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.81       |
|    explained_variance   | 0.77        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.695       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0364     |
|    value_loss           | 3.4         |
---

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 30.840384615384615
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -70.3       |
| time/                   |             |
|    fps                  | 371         |
|    iterations           | 26          |
|    time_elapsed         | 71          |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.006799252 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.42       |
|    explained_variance   | 0.828       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.994       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.04       |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 34.90882352941176
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -45.3       |
| time/                   |             |
|    fps                  | 371         |
|    iterations           | 34          |
|    time_elapsed         | 93          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008883871 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.68       |
|    explained_variance   | 0.756       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.773       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0467     |
|    value_loss           | 2.75

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 38.917857142857144
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -15.6       |
| time/                   |             |
|    fps                  | 369         |
|    iterations           | 42          |
|    time_elapsed         | 116         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.006229544 |
|    clip_fraction        | 0.124       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.89       |
|    explained_variance   | 0.789       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.931       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0381     |
|    value_loss           | 2.4

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 42.444
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 5.42         |
| time/                   |              |
|    fps                  | 368          |
|    iterations           | 50           |
|    time_elapsed         | 138          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0052911304 |
|    clip_fraction        | 0.0972       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.49        |
|    explained_variance   | 0.834        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.687        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0302      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 45.35086206896552
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 15.3        |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 58          |
|    time_elapsed         | 161         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.004409626 |
|    clip_fraction        | 0.079       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.22       |
|    explained_variance   | 0.896       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.494       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0274     |
|    value_loss           | 1.62

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 47.88333333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 22.7         |
| time/                   |              |
|    fps                  | 368          |
|    iterations           | 66           |
|    time_elapsed         | 183          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0055714166 |
|    clip_fraction        | 0.0928       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.09        |
|    explained_variance   | 0.93         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.05         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0242      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 50.020270270270274
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 27.8        |
| time/                   |             |
|    fps                  | 369         |
|    iterations           | 74          |
|    time_elapsed         | 205         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.005118493 |
|    clip_fraction        | 0.0874      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.95       |
|    explained_variance   | 0.932       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.209       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0275     |
|    value_loss           | 1.1

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 51.80853658536585
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 30.9        |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 82          |
|    time_elapsed         | 227         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.004824507 |
|    clip_fraction        | 0.0957      |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.88       |
|    explained_variance   | 0.951       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.169       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.026      |
|    value_loss           | 0.87

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 53.32666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 33          |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 90          |
|    time_elapsed         | 249         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.005843224 |
|    clip_fraction        | 0.109       |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.85       |
|    explained_variance   | 0.956       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.174       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0318     |
|    value_loss           | 0.80

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 54.61887755102041
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 33.9        |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 98          |
|    time_elapsed         | 272         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005522723 |
|    clip_fraction        | 0.107       |
|    clip_range           | 0.15        |
|    entropy_loss         | -1.88       |
|    explained_variance   | 0.947       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.108       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0288     |
|    value_loss           | 0.84

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks100.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -96.0
Standard deviation of reward: 0.0
Average successful assignments: 3.2
All assignments history: [4, 1, 5, 4, 6, 9, 4, 2, 5, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -91.2    |
| time/              |          |
|    fps             | 471      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -74.0
Standard deviation of reward: 0.0
Average successful assignments: 5.875
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 100        |
|    ep_rew_mean          | -91.5      |
| time/                   |            |
|    fps                  | 413        |
|    iterations           | 2          |
|    time_elapsed         | 4          |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00621165 |
|    clip_fraction        | 0.0601     |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.02      |
|    explained_variance   | -0.0863    |
|    learning_rate        | 0.00018    |
|    loss                 | 2.29       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0332    |
|    value_loss           | 15.4       |
---------------------

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 18.275
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -89.7       |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 10          |
|    time_elapsed         | 27          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.008205964 |
|    clip_fraction        | 0.138       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.96       |
|    explained_variance   | 0.241       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.39        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0404     |
|    value_loss           | 6.16        |


-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 22.919444444444444
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -85.5       |
| time/                   |             |
|    fps                  | 369         |
|    iterations           | 18          |
|    time_elapsed         | 49          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.006190623 |
|    clip_fraction        | 0.0823      |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.87       |
|    explained_variance   | 0.774       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.879       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0364     |
|    value_loss           | 3.92

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 27.040384615384614
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | -74.5        |
| time/                   |              |
|    fps                  | 368          |
|    iterations           | 26           |
|    time_elapsed         | 72           |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0063279225 |
|    clip_fraction        | 0.0988       |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.58        |
|    explained_variance   | 0.833        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.83         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0369      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 31.354411764705883
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -53.6       |
| time/                   |             |
|    fps                  | 367         |
|    iterations           | 34          |
|    time_elapsed         | 94          |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007065517 |
|    clip_fraction        | 0.106       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.9        |
|    explained_variance   | 0.827       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.12        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.037      |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 35.59404761904762
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -23.9       |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 42          |
|    time_elapsed         | 116         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.006487024 |
|    clip_fraction        | 0.14        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.1        |
|    explained_variance   | 0.776       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0405     |
|    value_loss           | 2.9 

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 39.407
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -1.28       |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 50          |
|    time_elapsed         | 138         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.005017938 |
|    clip_fraction        | 0.0841      |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.55       |
|    explained_variance   | 0.815       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.1         |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0292     |
|    value_loss           | 2.52        |
-

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 42.60689655172414
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 11.6        |
| time/                   |             |
|    fps                  | 368         |
|    iterations           | 58          |
|    time_elapsed         | 161         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.004724918 |
|    clip_fraction        | 0.093       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.34       |
|    explained_variance   | 0.791       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.953       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0276     |
|    value_loss           | 2.69

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 45.3
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 18.3         |
| time/                   |              |
|    fps                  | 368          |
|    iterations           | 66           |
|    time_elapsed         | 183          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0045389794 |
|    clip_fraction        | 0.076        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.21        |
|    explained_variance   | 0.858        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.24         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0233      |
|    value_loss           

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 47.5722972972973
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 24.4         |
| time/                   |              |
|    fps                  | 367          |
|    iterations           | 74           |
|    time_elapsed         | 205          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0050403783 |
|    clip_fraction        | 0.0902       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.1         |
|    explained_variance   | 0.893        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.537        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0235      |
|    value_los

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 49.447560975609754
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 26.2         |
| time/                   |              |
|    fps                  | 367          |
|    iterations           | 82           |
|    time_elapsed         | 228          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0054631755 |
|    clip_fraction        | 0.107        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.12        |
|    explained_variance   | 0.888        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.16         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0283      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 51.05222222222222
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 100          |
|    ep_rew_mean          | 27.5         |
| time/                   |              |
|    fps                  | 366          |
|    iterations           | 90           |
|    time_elapsed         | 251          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0050321273 |
|    clip_fraction        | 0.0896       |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.04        |
|    explained_variance   | 0.9          |
|    learning_rate        | 0.00018      |
|    loss                 | 0.408        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0262      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 52.425
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | 28.9        |
| time/                   |             |
|    fps                  | 365         |
|    iterations           | 98          |
|    time_elapsed         | 274         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.005855756 |
|    clip_fraction        | 0.115       |
|    clip_range           | 0.15        |
|    entropy_loss         | -2.01       |
|    explained_variance   | 0.912       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0275     |
|    value_loss           | 1.44        |
-