In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def train_ridge_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    ridge_model = Ridge()
    ridge_model.fit(X_train_scaled, y_train)
    return ridge_model, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.1.csv')
ridge_model, scaler = train_ridge_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.1.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = ridge_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']  # This would be prior to overwriting with predictions if you run this block again

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


MAE: 0.8967544998516832
RMSE: 1.1358026956536529
R-squared: 0.991274663358528
RAE: 0.09365725311592739


In [2]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 6.2
All assignments history: [9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 110      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -182.0
Standard deviation of reward: 0.0
Average successful assignments: 7.566666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 2           |
|    time_elapsed         | 20          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008488427 |
|    clip_fraction        | 0.0861      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | -0.239      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.95        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0471     |
|    value_loss           | 17

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 33.14
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 10          |
|    time_elapsed         | 116         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012888368 |
|    clip_fraction        | 0.222       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.092       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.81        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0485     |
|    value_loss           | 5.15        |
--

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 46.61851851851852
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 18          |
|    time_elapsed         | 214         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.014049556 |
|    clip_fraction        | 0.286       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.41       |
|    explained_variance   | 0.422       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.43        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0556     |
|    value_loss           | 4.7

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 52.86923076923077
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 26          |
|    time_elapsed         | 314         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009683468 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.33       |
|    explained_variance   | 0.638       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.511       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.05       |
|    value_loss           | 3.68 

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 57.08627450980392
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -171        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 34          |
|    time_elapsed         | 419         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008386911 |
|    clip_fraction        | 0.143       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.18       |
|    explained_variance   | 0.698       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.878       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0496     |
|    value_loss           | 3.12

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 60.72063492063492
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -160        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 42          |
|    time_elapsed         | 521         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009487495 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.84       |
|    explained_variance   | 0.716       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0546     |
|    value_loss           | 2.91 

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 64.468
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -142        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 50          |
|    time_elapsed         | 620         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008570911 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.38       |
|    explained_variance   | 0.727       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.524       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0551     |
|    value_loss           | 2.45        |
-

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 68.59540229885057
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -119        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 58          |
|    time_elapsed         | 726         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007846071 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.92       |
|    explained_variance   | 0.721       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.892       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0539     |
|    value_loss           | 2.49

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 72.6959595959596
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -97.1       |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 66          |
|    time_elapsed         | 831         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010396188 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.8        |
|    explained_variance   | 0.648       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.84        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0551     |
|    value_loss           | 2.6  

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 76.47657657657658
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -81.9       |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 74          |
|    time_elapsed         | 934         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009321095 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.65       |
|    explained_variance   | 0.649       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.63        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0505     |
|    value_loss           | 2.76

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 79.98130081300813
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -68.7       |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 82          |
|    time_elapsed         | 1039        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009328337 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.49       |
|    explained_variance   | 0.617       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.917       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0549     |
|    value_loss           | 2.42

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 83.11333333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -58        |
| time/                   |            |
|    fps                  | 79         |
|    iterations           | 90         |
|    time_elapsed         | 1156       |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00927287 |
|    clip_fraction        | 0.194      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.28      |
|    explained_variance   | 0.651      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.876      |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0527    |
|    value_loss           | 2.43       |
----------

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 86.00680272108843
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -48.4       |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 98          |
|    time_elapsed         | 1268        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009218095 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.13       |
|    explained_variance   | 0.671       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.775       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0527     |
|    value_loss           | 2.41

In [3]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -194.0
Standard deviation of reward: 0.0
Average successful assignments: 4.266666666666667
All assignments history: [8, 5, 6, 4, 11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 84       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 10.5
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 77          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007112926 |
|    clip_fraction        | 0.056       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | -0.169      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.59        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0428     |
|    value_loss           | 17.2        |
-

-------- Rollout Summary --------
Total mean reward: -80.0
Standard deviation of reward: 0.0
Average successful assignments: 25.34
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 10          |
|    time_elapsed         | 137         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010676812 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.0913      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.924       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0432     |
|    value_loss           | 5.48        |
-

-------- Rollout Summary --------
Total mean reward: -20.0
Standard deviation of reward: 0.0
Average successful assignments: 40.9
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 18          |
|    time_elapsed         | 247         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012968586 |
|    clip_fraction        | 0.249       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.402       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.13        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.053      |
|    value_loss           | 4.69        |
--

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 49.54358974358974
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 73         |
|    iterations           | 26         |
|    time_elapsed         | 359        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.01155787 |
|    clip_fraction        | 0.219      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.4       |
|    explained_variance   | 0.482      |
|    learning_rate        | 0.00018    |
|    loss                 | 3.83       |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0511    |
|    value_loss           | 4.76       |
-----------

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 55.61764705882353
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -178        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 34          |
|    time_elapsed         | 469         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009321662 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.33       |
|    explained_variance   | 0.561       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0466     |
|    value_loss           | 4.29

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 59.833333333333336
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -171        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 42          |
|    time_elapsed         | 571         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008680664 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.19       |
|    explained_variance   | 0.652       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.26        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0471     |
|    value_loss           | 3.5

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 63.714666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -160        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 50          |
|    time_elapsed         | 676         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008357694 |
|    clip_fraction        | 0.131       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.9        |
|    explained_variance   | 0.664       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.805       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0488     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 66.85862068965517
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -142        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 58          |
|    time_elapsed         | 778         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008936231 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.43       |
|    explained_variance   | 0.751       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.943       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0523     |
|    value_loss           | 2.18

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 70.28989898989899
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -118        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 66          |
|    time_elapsed         | 879         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009068435 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.99       |
|    explained_variance   | 0.701       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.657       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0547     |
|    value_loss           | 2.27

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 73.86666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -95.3       |
| time/                   |             |
|    fps                  | 77          |
|    iterations           | 74          |
|    time_elapsed         | 982         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009529559 |
|    clip_fraction        | 0.189       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.75       |
|    explained_variance   | 0.665       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.48        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0524     |
|    value_loss           | 2.05

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 77.17560975609756
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -77.7       |
| time/                   |             |
|    fps                  | 77          |
|    iterations           | 82          |
|    time_elapsed         | 1082        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008525464 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.55       |
|    explained_variance   | 0.681       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.66        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0502     |
|    value_loss           | 2.01

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 80.50888888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -60.8       |
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 90          |
|    time_elapsed         | 1180        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008702802 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.16       |
|    explained_variance   | 0.658       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.436       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0518     |
|    value_loss           | 1.92

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 83.74285714285715
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -42.2      |
| time/                   |            |
|    fps                  | 78         |
|    iterations           | 98         |
|    time_elapsed         | 1281       |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.00912341 |
|    clip_fraction        | 0.196      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.01      |
|    explained_variance   | 0.634      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.933      |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0526    |
|    value_loss           | 2.37       |
----------

In [4]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -196.0
Standard deviation of reward: 0.0
Average successful assignments: 3.533333333333333
All assignments history: [8, 7, 10, 2, 6, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 93       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -120.0
Standard deviation of reward: 0.0
Average successful assignments: 15.966666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 2           |
|    time_elapsed         | 22          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007983511 |
|    clip_fraction        | 0.0721      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | -0.16       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.31        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0467     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 34.82
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -186       |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 10         |
|    time_elapsed         | 120        |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01152302 |
|    clip_fraction        | 0.191      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.45      |
|    explained_variance   | 0.0545     |
|    learning_rate        | 0.00018    |
|    loss                 | 2.21       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0442    |
|    value_loss           | 5.56       |
---------------------

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 47.20740740740741
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 18          |
|    time_elapsed         | 218         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012244075 |
|    clip_fraction        | 0.239       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.41       |
|    explained_variance   | 0.497       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0512     |
|    value_loss           | 4.02

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 54.66923076923077
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 26          |
|    time_elapsed         | 318         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009749154 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.35       |
|    explained_variance   | 0.687       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.755       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0519     |
|    value_loss           | 3.41 

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 59.245098039215684
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 34          |
|    time_elapsed         | 410         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009664878 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.21       |
|    explained_variance   | 0.772       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.13        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.053      |
|    value_loss           | 2.76

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 62.89523809523809
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -163        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 42          |
|    time_elapsed         | 501         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008528083 |
|    clip_fraction        | 0.156       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.95       |
|    explained_variance   | 0.788       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.712       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0514     |
|    value_loss           | 2.53

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 66.50533333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -145        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 592         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009905327 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.54       |
|    explained_variance   | 0.769       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.766       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0583     |
|    value_loss           | 2.39

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 70.26896551724138
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -121        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 58          |
|    time_elapsed         | 682         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009425869 |
|    clip_fraction        | 0.193       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.03       |
|    explained_variance   | 0.773       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.815       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0538     |
|    value_loss           | 2.24

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 74.02626262626262
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -94.4       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 66          |
|    time_elapsed         | 771         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010668375 |
|    clip_fraction        | 0.232       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.73       |
|    explained_variance   | 0.641       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.788       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0588     |
|    value_loss           | 2.06

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 77.61891891891892
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -74.3       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 74          |
|    time_elapsed         | 861         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.010153053 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.56       |
|    explained_variance   | 0.747       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.633       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 1.94

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 80.78861788617886
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -61.8       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 82          |
|    time_elapsed         | 950         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008898566 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.35       |
|    explained_variance   | 0.781       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.632       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0533     |
|    value_loss           | 2.33

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 83.71629629629629
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -50.5       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 90          |
|    time_elapsed         | 1043        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009511631 |
|    clip_fraction        | 0.221       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.19       |
|    explained_variance   | 0.71        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.562       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0564     |
|    value_loss           | 1.94

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 86.3687074829932
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -38.9       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 98          |
|    time_elapsed         | 1137        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009614797 |
|    clip_fraction        | 0.216       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.03       |
|    explained_variance   | 0.689       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.781       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0554     |
|    value_loss           | 2.15 

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 5.6
All assignments history: [7, 6, 4, 8, 9, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 97       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -144.0
Standard deviation of reward: 0.0
Average successful assignments: 13.166666666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -187         |
| time/                   |              |
|    fps                  | 96           |
|    iterations           | 2            |
|    time_elapsed         | 21           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0076977387 |
|    clip_fraction        | 0.0646       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.5         |
|    explained_variance   | -0.358       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.6          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.046       |
|    value

-------- Rollout Summary --------
Total mean reward: -12.0
Standard deviation of reward: 0.0
Average successful assignments: 43.4
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 10          |
|    time_elapsed         | 110         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011716496 |
|    clip_fraction        | 0.217       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.0314      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.86        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0481     |
|    value_loss           | 5.2         |
--

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 54.95925925925926
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 18          |
|    time_elapsed         | 200         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011960281 |
|    clip_fraction        | 0.222       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.401       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.67        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 4.39 

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 60.54102564102564
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 26          |
|    time_elapsed         | 287         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.011498246 |
|    clip_fraction        | 0.219       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.38       |
|    explained_variance   | 0.546       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.81        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0535     |
|    value_loss           | 4.32

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 65.31960784313725
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 34          |
|    time_elapsed         | 377         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010364192 |
|    clip_fraction        | 0.19        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.3        |
|    explained_variance   | 0.623       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.78        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0533     |
|    value_loss           | 3.88

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 69.11111111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 42          |
|    time_elapsed         | 469         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010258092 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.17       |
|    explained_variance   | 0.65        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.33        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 3.39

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 72.04933333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -157         |
| time/                   |              |
|    fps                  | 91           |
|    iterations           | 50           |
|    time_elapsed         | 560          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0082891835 |
|    clip_fraction        | 0.16         |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.93        |
|    explained_variance   | 0.653        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.24         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0521      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 74.42068965517241
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -142        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 58          |
|    time_elapsed         | 653         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008016368 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.57       |
|    explained_variance   | 0.607       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.686       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0495     |
|    value_loss           | 3.14

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 77.16868686868686
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -123        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 66          |
|    time_elapsed         | 746         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009306567 |
|    clip_fraction        | 0.185       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.24       |
|    explained_variance   | 0.609       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.853       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0551     |
|    value_loss           | 2.63

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 79.90810810810811
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -105        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 74          |
|    time_elapsed         | 838         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009766851 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.96       |
|    explained_variance   | 0.582       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.844       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0565     |
|    value_loss           | 2.42

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 82.58455284552845
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -86         |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 82          |
|    time_elapsed         | 930         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009182626 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.57       |
|    explained_variance   | 0.48        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.839       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0545     |
|    value_loss           | 2.35

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 85.07037037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -65.8       |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 90          |
|    time_elapsed         | 1020        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009395112 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.25       |
|    explained_variance   | 0.367       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0544     |
|    value_loss           | 3.02

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 87.31428571428572
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -48.5       |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 98          |
|    time_elapsed         | 1110        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009313986 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.09       |
|    explained_variance   | 0.397       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.654       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0532     |
|    value_loss           | 2.37

In [6]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 6.6
All assignments history: [1, 7, 8, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 100      |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -138.0
Standard deviation of reward: 0.0
Average successful assignments: 14.633333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 99          |
|    iterations           | 2           |
|    time_elapsed         | 20          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008716557 |
|    clip_fraction        | 0.0804      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | -0.295      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.07        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0462     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -148.0
Standard deviation of reward: 0.0
Average successful assignments: 22.506666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 10          |
|    time_elapsed         | 109         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012297547 |
|    clip_fraction        | 0.254       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.111       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.09        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.048      |
|    value_loss           | 4

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 40.43333333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -184         |
| time/                   |              |
|    fps                  | 89           |
|    iterations           | 18           |
|    time_elapsed         | 206          |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0107231345 |
|    clip_fraction        | 0.193        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.44        |
|    explained_variance   | 0.533        |
|    learning_rate        | 0.00018      |
|    loss                 | 2.93         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0508      |
|    value_los

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 50.05128205128205
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 26          |
|    time_elapsed         | 302         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010595167 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.4        |
|    explained_variance   | 0.708       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.754       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0537     |
|    value_loss           | 3.49

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 56.50588235294118
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 34          |
|    time_elapsed         | 397         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009654767 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.28       |
|    explained_variance   | 0.734       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.324       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0538     |
|    value_loss           | 3.14

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 61.6031746031746
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -168        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 42          |
|    time_elapsed         | 496         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010712572 |
|    clip_fraction        | 0.225       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.06       |
|    explained_variance   | 0.722       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.549       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0591     |
|    value_loss           | 2.89 

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 66.09466666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -151        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 593         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.011137875 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.67       |
|    explained_variance   | 0.563       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0574     |
|    value_loss           | 3.21

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 70.17931034482758
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -129        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 58          |
|    time_elapsed         | 691         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009081427 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.29       |
|    explained_variance   | 0.491       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.686       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0547     |
|    value_loss           | 2.79

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 74.13232323232323
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -104        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 66          |
|    time_elapsed         | 789         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010858055 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.89       |
|    explained_variance   | 0.45        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.778       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0568     |
|    value_loss           | 2.77

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 77.56846846846847
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -83.7       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 74          |
|    time_elapsed         | 887         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009289548 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.6        |
|    explained_variance   | 0.467       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.831       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0521     |
|    value_loss           | 2.55

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 80.70813008130081
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -64.4       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 82          |
|    time_elapsed         | 986         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009505877 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.38       |
|    explained_variance   | 0.506       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.653       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0534     |
|    value_loss           | 2.31

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 83.5125925925926
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -50.7       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 90          |
|    time_elapsed         | 1084        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009008711 |
|    clip_fraction        | 0.217       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.14       |
|    explained_variance   | 0.461       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0551     |
|    value_loss           | 2.52 

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 86.24489795918367
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -38.4       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 98          |
|    time_elapsed         | 1180        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009012379 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.98       |
|    explained_variance   | 0.483       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.878       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.05       |
|    value_loss           | 2.35

done till here

In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -180.0
Standard deviation of reward: 0.0
Average successful assignments: 9.066666666666666
All assignments history: [3, 8, 11, 8, 6, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 90       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 13.333333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -184         |
| time/                   |              |
|    fps                  | 90           |
|    iterations           | 2            |
|    time_elapsed         | 22           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0073800776 |
|    clip_fraction        | 0.0617       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.5         |
|    explained_variance   | -0.249       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.67         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0448      |
|    value

-------- Rollout Summary --------
Total mean reward: -44.0
Standard deviation of reward: 0.0
Average successful assignments: 34.71333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 10          |
|    time_elapsed         | 115         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011334265 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.114       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.58        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0459     |
|    value_loss           | 5.2

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 50.214814814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 18          |
|    time_elapsed         | 213         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009967918 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.431       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0489     |
|    value_loss           | 4.3

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 58.91538461538462
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 26          |
|    time_elapsed         | 311         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010594491 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.627       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.67        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0517     |
|    value_loss           | 3.5 

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 63.72549019607843
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 34          |
|    time_elapsed         | 408         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009908002 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.26       |
|    explained_variance   | 0.695       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.581       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0517     |
|    value_loss           | 3.33

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 67.26666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -165        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 42          |
|    time_elapsed         | 506         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008941755 |
|    clip_fraction        | 0.156       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.02       |
|    explained_variance   | 0.691       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.24        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0524     |
|    value_loss           | 3.08

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 70.428
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -148        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 50          |
|    time_elapsed         | 604         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009046512 |
|    clip_fraction        | 0.15        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.6        |
|    explained_variance   | 0.613       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.907       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0522     |
|    value_loss           | 3.03        |
-

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 73.97931034482758
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -124        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 58          |
|    time_elapsed         | 703         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009368284 |
|    clip_fraction        | 0.189       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.07       |
|    explained_variance   | 0.545       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0557     |
|    value_loss           | 2.82

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 77.54545454545455
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -101        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 66          |
|    time_elapsed         | 804         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009193518 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.87       |
|    explained_variance   | 0.588       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.676       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0562     |
|    value_loss           | 2.31

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 80.97747747747748
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -81.5       |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 74          |
|    time_elapsed         | 895         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009406432 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.58       |
|    explained_variance   | 0.491       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.662       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 2.46

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 84.21869918699187
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -64.8       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 82          |
|    time_elapsed         | 985         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009118409 |
|    clip_fraction        | 0.18        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.24       |
|    explained_variance   | 0.456       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.631       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0512     |
|    value_loss           | 2.07

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 87.36074074074074
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -45.4      |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 90         |
|    time_elapsed         | 1076       |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00967711 |
|    clip_fraction        | 0.212      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.95      |
|    explained_variance   | 0.438      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.641      |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0527    |
|    value_loss           | 2.06       |
----------

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 90.11904761904762
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -28.2       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 98          |
|    time_elapsed         | 1173        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.010881467 |
|    clip_fraction        | 0.244       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.79       |
|    explained_variance   | 0.423       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.712       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.15

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 5.8
All assignments history: [5, 7, 5, 11, 9, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -185     |
| time/              |          |
|    fps             | 94       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -136.0
Standard deviation of reward: 0.0
Average successful assignments: 14.566666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 2           |
|    time_elapsed         | 22          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007996151 |
|    clip_fraction        | 0.0728      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | -0.32       |
|    learning_rate        | 0.00018     |
|    loss                 | 3.23        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0456     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 41.79333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 10          |
|    time_elapsed         | 116         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012231614 |
|    clip_fraction        | 0.23        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.0741      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0464     |
|    value_loss           | 4.9

-------- Rollout Summary --------
Total mean reward: -36.0
Standard deviation of reward: 0.0
Average successful assignments: 50.262962962962966
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 18          |
|    time_elapsed         | 212         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.013269152 |
|    clip_fraction        | 0.25        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.528       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.07        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0535     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 55.13076923076923
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 26          |
|    time_elapsed         | 307         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010331518 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.38       |
|    explained_variance   | 0.648       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.25        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0508     |
|    value_loss           | 3.78 

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 58.39607843137255
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 34          |
|    time_elapsed         | 403         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010147725 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.3        |
|    explained_variance   | 0.694       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.37        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0535     |
|    value_loss           | 3.51

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 61.522222222222226
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 42          |
|    time_elapsed         | 499         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009372295 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.14       |
|    explained_variance   | 0.686       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.4         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0551     |
|    value_loss           | 3.49

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 64.18
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -155        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 50          |
|    time_elapsed         | 595         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009867021 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.77       |
|    explained_variance   | 0.698       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.671       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0552     |
|    value_loss           | 2.45        |
---

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 66.64367816091954
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -134         |
| time/                   |              |
|    fps                  | 86           |
|    iterations           | 58           |
|    time_elapsed         | 688          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0078002987 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.36        |
|    explained_variance   | 0.732        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.05         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0477      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 69.70707070707071
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -111        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 66          |
|    time_elapsed         | 779         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009455958 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.97       |
|    explained_variance   | 0.7         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.668       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.21

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 72.93063063063063
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -88          |
| time/                   |              |
|    fps                  | 87           |
|    iterations           | 74           |
|    time_elapsed         | 863          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0073694484 |
|    clip_fraction        | 0.128        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.64        |
|    explained_variance   | 0.681        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.869        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0459      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 75.98943089430894
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -69.6       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 82          |
|    time_elapsed         | 948         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008541344 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.42       |
|    explained_variance   | 0.696       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.607       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0523     |
|    value_loss           | 2.05

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 79.23185185185186
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -52.6       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 90          |
|    time_elapsed         | 1032        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008175315 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.17       |
|    explained_variance   | 0.524       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.768       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 2.4 

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 82.54217687074829
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -36.3       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 98          |
|    time_elapsed         | 1116        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008165136 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.02       |
|    explained_variance   | 0.581       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.925       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0527     |
|    value_loss           | 2.24

In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 5.533333333333333
All assignments history: [6, 9, 8, 6, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 112      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -166.0
Standard deviation of reward: 0.0
Average successful assignments: 9.566666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 105         |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008538669 |
|    clip_fraction        | 0.0812      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | -0.301      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.14        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0475     |
|    value_loss           | 18

-------- Rollout Summary --------
Total mean reward: -70.0
Standard deviation of reward: 0.0
Average successful assignments: 14.473333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 99          |
|    iterations           | 10          |
|    time_elapsed         | 103         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011088274 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.073       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0463     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -84.0
Standard deviation of reward: 0.0
Average successful assignments: 25.714814814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 18          |
|    time_elapsed         | 184         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012463628 |
|    clip_fraction        | 0.263       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.508       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.84        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0557     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 35.89230769230769
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 102        |
|    iterations           | 26         |
|    time_elapsed         | 260        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00958986 |
|    clip_fraction        | 0.166      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.41      |
|    explained_variance   | 0.626      |
|    learning_rate        | 0.00018    |
|    loss                 | 2.38       |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0492    |
|    value_loss           | 3.89       |
---------

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 44.8078431372549
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 104         |
|    iterations           | 34          |
|    time_elapsed         | 333         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.011080688 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.72        |
|    learning_rate        | 0.00018     |
|    loss                 | 2.67        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0559     |
|    value_loss           | 3.09 

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 50.94285714285714
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 105         |
|    iterations           | 42          |
|    time_elapsed         | 406         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.011134844 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.27       |
|    explained_variance   | 0.751       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.61        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0584     |
|    value_loss           | 2.87

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 55.82533333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -165        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 50          |
|    time_elapsed         | 479         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010781329 |
|    clip_fraction        | 0.222       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.05       |
|    explained_variance   | 0.731       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.09        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0609     |
|    value_loss           | 2.53

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 60.60459770114942
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -150        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 58          |
|    time_elapsed         | 552         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010101575 |
|    clip_fraction        | 0.185       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.73       |
|    explained_variance   | 0.702       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.332       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0576     |
|    value_loss           | 2.44

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 64.67979797979798
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -131        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 66          |
|    time_elapsed         | 623         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009146852 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.39       |
|    explained_variance   | 0.67        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.477       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0534     |
|    value_loss           | 2.52

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 68.57207207207207
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -109        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 74          |
|    time_elapsed         | 698         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008550471 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.94       |
|    explained_variance   | 0.639       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.821       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0535     |
|    value_loss           | 2.49

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 72.2349593495935
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -84.5       |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 82          |
|    time_elapsed         | 768         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008796427 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.51       |
|    explained_variance   | 0.585       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.571       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0504     |
|    value_loss           | 2.13 

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 75.85777777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -58.9       |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 90          |
|    time_elapsed         | 842         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008815294 |
|    clip_fraction        | 0.187       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.09       |
|    explained_variance   | 0.429       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.927       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0503     |
|    value_loss           | 2.34

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 79.30136054421769
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -36.4       |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 98          |
|    time_elapsed         | 917         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008985432 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.93       |
|    explained_variance   | 0.469       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.634       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0489     |
|    value_loss           | 2.04

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 6.8
All assignments history: [8, 7, 4, 5, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 138      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -162.0
Standard deviation of reward: 0.0
Average successful assignments: 10.866666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 2           |
|    time_elapsed         | 17          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008140134 |
|    clip_fraction        | 0.0732      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.5        |
|    explained_variance   | -0.241      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.21        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0451     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -38.0
Standard deviation of reward: 0.0
Average successful assignments: 31.16
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 10          |
|    time_elapsed         | 92          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011381704 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.0778      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.601       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0456     |
|    value_loss           | 5.71        |
-

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 44.3962962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 18          |
|    time_elapsed         | 182         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011945326 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.44       |
|    explained_variance   | 0.516       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.792       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0544     |
|    value_loss           | 3.97  

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 52.54102564102564
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -182         |
| time/                   |              |
|    fps                  | 106          |
|    iterations           | 26           |
|    time_elapsed         | 250          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0110857785 |
|    clip_fraction        | 0.22         |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.41        |
|    explained_variance   | 0.689        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.98         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0571      |
|    value_los

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 57.23137254901961
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 105         |
|    iterations           | 34          |
|    time_elapsed         | 328         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.011042632 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.33       |
|    explained_variance   | 0.723       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.633       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0572     |
|    value_loss           | 3.24

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 60.84444444444444
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -170        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 42          |
|    time_elapsed         | 403         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009106379 |
|    clip_fraction        | 0.159       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.14       |
|    explained_variance   | 0.755       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.39        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 2.59

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 64.128
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -157        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 50          |
|    time_elapsed         | 476         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009162295 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.82       |
|    explained_variance   | 0.7         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.647       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0561     |
|    value_loss           | 2.63        |
-

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 67.08505747126436
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -139         |
| time/                   |              |
|    fps                  | 108          |
|    iterations           | 58           |
|    time_elapsed         | 545          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0090683745 |
|    clip_fraction        | 0.165        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.47        |
|    explained_variance   | 0.689        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.669        |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0558      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 69.98484848484848
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -121        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 66          |
|    time_elapsed         | 613         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009422033 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.12       |
|    explained_variance   | 0.694       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.458       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.057      |
|    value_loss           | 2.04

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 73.05405405405405
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -101        |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 74          |
|    time_elapsed         | 680         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.007833806 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.84       |
|    explained_variance   | 0.697       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.334       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0491     |
|    value_loss           | 1.85

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 76.04715447154472
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -82.9       |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 82          |
|    time_elapsed         | 748         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008732684 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.54       |
|    explained_variance   | 0.709       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.735       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.047      |
|    value_loss           | 2.31

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 78.94296296296297
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -65.2       |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 90          |
|    time_elapsed         | 817         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009260284 |
|    clip_fraction        | 0.185       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.37       |
|    explained_variance   | 0.717       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.716       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0516     |
|    value_loss           | 2.22

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 81.67619047619047
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -50.9        |
| time/                   |              |
|    fps                  | 113          |
|    iterations           | 98           |
|    time_elapsed         | 885          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0084003685 |
|    clip_fraction        | 0.176        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.14        |
|    explained_variance   | 0.642        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.664        |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0509      |
|    value_lo

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -180.0
Standard deviation of reward: 0.0
Average successful assignments: 8.466666666666667
All assignments history: [7, 9, 7, 1, 3, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -189     |
| time/              |          |
|    fps             | 119      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -168.0
Standard deviation of reward: 0.0
Average successful assignments: 10.7
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -188         |
| time/                   |              |
|    fps                  | 117          |
|    iterations           | 2            |
|    time_elapsed         | 17           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0076514324 |
|    clip_fraction        | 0.0658       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.5         |
|    explained_variance   | -0.112       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.43         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0457      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: -28.0
Standard deviation of reward: 0.0
Average successful assignments: 33.193333333333335
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -185         |
| time/                   |              |
|    fps                  | 117          |
|    iterations           | 10           |
|    time_elapsed         | 86           |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0124403825 |
|    clip_fraction        | 0.228        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.46        |
|    explained_variance   | 0.105        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.319        |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.0497      |
|    value_

-------- Rollout Summary --------
Total mean reward: -16.0
Standard deviation of reward: 0.0
Average successful assignments: 45.34814814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 18          |
|    time_elapsed         | 153         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012602808 |
|    clip_fraction        | 0.26        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.558       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.583       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0561     |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 53.1
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 26          |
|    time_elapsed         | 219         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.012125655 |
|    clip_fraction        | 0.236       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.666       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.967       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0559     |
|    value_loss           | 3.53        |
---

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 58.06666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 34          |
|    time_elapsed         | 286         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009169591 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.23       |
|    explained_variance   | 0.678       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.31        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0518     |
|    value_loss           | 3.37

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 61.86666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -162        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 42          |
|    time_elapsed         | 352         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009157475 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.94       |
|    explained_variance   | 0.67        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.39        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0558     |
|    value_loss           | 2.94

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 65.81066666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -146        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 50          |
|    time_elapsed         | 418         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.011574819 |
|    clip_fraction        | 0.237       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.66       |
|    explained_variance   | 0.604       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.595       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0608     |
|    value_loss           | 2.91

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 69.38735632183908
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -130        |
| time/                   |             |
|    fps                  | 123         |
|    iterations           | 58          |
|    time_elapsed         | 481         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009117387 |
|    clip_fraction        | 0.183       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.596       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.964       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0547     |
|    value_loss           | 2.71

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 73.55858585858586
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -112         |
| time/                   |              |
|    fps                  | 124          |
|    iterations           | 66           |
|    time_elapsed         | 540          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0073810006 |
|    clip_fraction        | 0.136        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.07        |
|    explained_variance   | 0.59         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.586        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0491      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 76.90900900900901
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -95.8       |
| time/                   |             |
|    fps                  | 127         |
|    iterations           | 74          |
|    time_elapsed         | 595         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008902332 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.8        |
|    explained_variance   | 0.558       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.776       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0548     |
|    value_loss           | 2.48

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 80.4130081300813
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -79.1       |
| time/                   |             |
|    fps                  | 130         |
|    iterations           | 82          |
|    time_elapsed         | 642         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008481371 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.55       |
|    explained_variance   | 0.579       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.952       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0518     |
|    value_loss           | 2.56 

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 83.8362962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -59.5       |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 90          |
|    time_elapsed         | 675         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009223225 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.21       |
|    explained_variance   | 0.555       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.664       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0537     |
|    value_loss           | 2.28 

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 86.87278911564626
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -41.7       |
| time/                   |             |
|    fps                  | 141         |
|    iterations           | 98          |
|    time_elapsed         | 708         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008167173 |
|    clip_fraction        | 0.147       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.02       |
|    explained_variance   | 0.53        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.699       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0447     |
|    value_loss           | 2.19