In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def train_ridge_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    ridge_model = Ridge()
    ridge_model.fit(X_train_scaled, y_train)
    return ridge_model, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.01.csv')
ridge_model, scaler = train_ridge_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.01.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = ridge_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']  # This would be prior to overwriting with predictions if you run this block again

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


MAE: 0.09766677979988178
RMSE: 0.12209370048784808
R-squared: 0.9998991762339906
RAE: 0.010200341696916998


In [2]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -198.0
Standard deviation of reward: 0.0
Average successful assignments: 2.533333333333333
All assignments history: [8, 4, 3, 7, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -189     |
| time/              |          |
|    fps             | 117      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -92.0
Standard deviation of reward: 0.0
Average successful assignments: 20.466666666666665
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 103         |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007828204 |
|    clip_fraction        | 0.0667      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | -0.364      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.2         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0437     |
|    value_loss           | 19

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 30.30666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 10          |
|    time_elapsed         | 114         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011504343 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.168       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.924       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0462     |
|    value_loss           | 4.8

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 45.492592592592594
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 18          |
|    time_elapsed         | 211         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012877332 |
|    clip_fraction        | 0.266       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.42       |
|    explained_variance   | 0.588       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.03        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.058      |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 53.823076923076925
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 26          |
|    time_elapsed         | 311         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009556917 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.752       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.0272      |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0531     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 59.62352941176471
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 34          |
|    time_elapsed         | 413         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.011236543 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.28       |
|    explained_variance   | 0.804       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.361       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0566     |
|    value_loss           | 2.42

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 64.4984126984127
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -168        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 42          |
|    time_elapsed         | 516         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.011183668 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.08       |
|    explained_variance   | 0.803       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.399       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0609     |
|    value_loss           | 2.17 

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 68.99466666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -154        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 50          |
|    time_elapsed         | 615         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010046101 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.66       |
|    explained_variance   | 0.756       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.835       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.06       |
|    value_loss           | 2.29

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 73.2977011494253
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -131        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 58          |
|    time_elapsed         | 720         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009092037 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.17       |
|    explained_variance   | 0.727       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.431       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0573     |
|    value_loss           | 1.75 

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 77.52525252525253
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -105         |
| time/                   |              |
|    fps                  | 81           |
|    iterations           | 66           |
|    time_elapsed         | 824          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0103666745 |
|    clip_fraction        | 0.201        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.77        |
|    explained_variance   | 0.656        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.617        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0566      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 81.31531531531532
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -77.7       |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 74          |
|    time_elapsed         | 928         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.010081214 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.38       |
|    explained_variance   | 0.536       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.965       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.056      |
|    value_loss           | 2.6 

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 84.75691056910568
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -56.5       |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 82          |
|    time_elapsed         | 1032        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009384202 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.17       |
|    explained_variance   | 0.469       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.734       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0532     |
|    value_loss           | 2.6 

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 87.70296296296296
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -42.2       |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 90          |
|    time_elapsed         | 1149        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008839945 |
|    clip_fraction        | 0.173       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.11       |
|    explained_variance   | 0.608       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.838       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0466     |
|    value_loss           | 2.45

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 90.3
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -33.9        |
| time/                   |              |
|    fps                  | 79           |
|    iterations           | 98           |
|    time_elapsed         | 1261         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0085407365 |
|    clip_fraction        | 0.174        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.09        |
|    explained_variance   | 0.652        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.603        |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0492      |
|    value_loss           

In [3]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 4.6
All assignments history: [6, 6, 8, 7, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 80       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -164.0
Standard deviation of reward: 0.0
Average successful assignments: 9.533333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 2           |
|    time_elapsed         | 27          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008059041 |
|    clip_fraction        | 0.0703      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | -0.11       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.78        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0446     |
|    value_loss           | 16

-------- Rollout Summary --------
Total mean reward: -26.0
Standard deviation of reward: 0.0
Average successful assignments: 31.36
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -187       |
| time/                   |            |
|    fps                  | 74         |
|    iterations           | 10         |
|    time_elapsed         | 136        |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01176564 |
|    clip_fraction        | 0.197      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.46      |
|    explained_variance   | 0.15       |
|    learning_rate        | 0.00018    |
|    loss                 | 1.18       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0477    |
|    value_loss           | 5.22       |
---------------------

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 47.81111111111111
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -185         |
| time/                   |              |
|    fps                  | 74           |
|    iterations           | 18           |
|    time_elapsed         | 245          |
|    total_timesteps      | 18432        |
| train/                  |              |
|    approx_kl            | 0.0108119575 |
|    clip_fraction        | 0.18         |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.43        |
|    explained_variance   | 0.517        |
|    learning_rate        | 0.00018      |
|    loss                 | 2.85         |
|    n_updates            | 170          |
|    policy_gradient_loss | -0.0507      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 54.96153846153846
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 26          |
|    time_elapsed         | 355         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.011708455 |
|    clip_fraction        | 0.217       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.651       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.39        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0541     |
|    value_loss           | 3.53 

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 59.3235294117647
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 34          |
|    time_elapsed         | 465         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009379253 |
|    clip_fraction        | 0.185       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.24       |
|    explained_variance   | 0.749       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.055      |
|    value_loss           | 2.87 

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 63.34761904761905
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -166        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 42          |
|    time_elapsed         | 571         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010277458 |
|    clip_fraction        | 0.193       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.96       |
|    explained_variance   | 0.787       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.47        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0577     |
|    value_loss           | 2.39

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 68.12266666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -145        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 50          |
|    time_elapsed         | 677         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009373242 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.42       |
|    explained_variance   | 0.755       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.834       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0571     |
|    value_loss           | 2.66

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 72.86551724137931
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -119        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 58          |
|    time_elapsed         | 779         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009629726 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.01       |
|    explained_variance   | 0.746       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.672       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0537     |
|    value_loss           | 2.34

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 77.15454545454546
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -92.7      |
| time/                   |            |
|    fps                  | 76         |
|    iterations           | 66         |
|    time_elapsed         | 878        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.01012799 |
|    clip_fraction        | 0.21       |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.64      |
|    explained_variance   | 0.632      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.652      |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0573    |
|    value_loss           | 2.46       |
----------

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 81.02072072072072
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -70.5       |
| time/                   |             |
|    fps                  | 77          |
|    iterations           | 74          |
|    time_elapsed         | 978         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009599419 |
|    clip_fraction        | 0.183       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.35       |
|    explained_variance   | 0.548       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.843       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0513     |
|    value_loss           | 2.39

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 84.54227642276423
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -50.6       |
| time/                   |             |
|    fps                  | 77          |
|    iterations           | 82          |
|    time_elapsed         | 1079        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008417375 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.07       |
|    explained_variance   | 0.552       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.926       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 3.03

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 87.58518518518518
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -36        |
| time/                   |            |
|    fps                  | 78         |
|    iterations           | 90         |
|    time_elapsed         | 1177       |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00951835 |
|    clip_fraction        | 0.194      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.04      |
|    explained_variance   | 0.685      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.875      |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0511    |
|    value_loss           | 2.49       |
----------

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 90.17551020408163
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -29.2      |
| time/                   |            |
|    fps                  | 78         |
|    iterations           | 98         |
|    time_elapsed         | 1276       |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.00820351 |
|    clip_fraction        | 0.187      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.99      |
|    explained_variance   | 0.644      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.837      |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.053     |
|    value_loss           | 2.33       |
----------

In [4]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 5.333333333333333
All assignments history: [7, 5, 3, 8, 7, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 95       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -162.0
Standard deviation of reward: 0.0
Average successful assignments: 10.033333333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -188       |
| time/                   |            |
|    fps                  | 89         |
|    iterations           | 2          |
|    time_elapsed         | 22         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00725755 |
|    clip_fraction        | 0.0522     |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.49      |
|    explained_variance   | -0.173     |
|    learning_rate        | 0.00018    |
|    loss                 | 3.25       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0437    |
|    value_loss           | 18.7       |
-------

-------- Rollout Summary --------
Total mean reward: -58.0
Standard deviation of reward: 0.0
Average successful assignments: 25.206666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 10          |
|    time_elapsed         | 119         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012106564 |
|    clip_fraction        | 0.211       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.165       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.22        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0478     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -26.0
Standard deviation of reward: 0.0
Average successful assignments: 38.80740740740741
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 18          |
|    time_elapsed         | 214         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011404935 |
|    clip_fraction        | 0.217       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.42       |
|    explained_variance   | 0.484       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.92        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0523     |
|    value_loss           | 4.2

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 47.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 26          |
|    time_elapsed         | 312         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009033908 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.36       |
|    explained_variance   | 0.65        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.21        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0512     |
|    value_loss           | 3.47        |
---

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 53.4156862745098
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 34          |
|    time_elapsed         | 406         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010307284 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.23       |
|    explained_variance   | 0.704       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.893       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.054      |
|    value_loss           | 3.14 

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 58.6
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -162        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 42          |
|    time_elapsed         | 497         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009878598 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.99       |
|    explained_variance   | 0.729       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.658       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0547     |
|    value_loss           | 2.68        |
---

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 64.036
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -144        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 50          |
|    time_elapsed         | 587         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009659826 |
|    clip_fraction        | 0.183       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.47       |
|    explained_variance   | 0.688       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.506       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.056      |
|    value_loss           | 2.36        |
-

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 68.91264367816092
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -120        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 58          |
|    time_elapsed         | 677         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009478593 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.11       |
|    explained_variance   | 0.69        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.634       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0551     |
|    value_loss           | 2.09

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 73.41010101010102
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -96.9       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 66          |
|    time_elapsed         | 765         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010806706 |
|    clip_fraction        | 0.212       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.78       |
|    explained_variance   | 0.622       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.484       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0548     |
|    value_loss           | 1.92

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 77.94594594594595
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -77         |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 74          |
|    time_elapsed         | 855         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009139296 |
|    clip_fraction        | 0.189       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.49       |
|    explained_variance   | 0.654       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.645       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0535     |
|    value_loss           | 1.99

-------- Rollout Summary --------
Total mean reward: 96.0
Standard deviation of reward: 0.0
Average successful assignments: 82.14715447154471
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -59.6       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 82          |
|    time_elapsed         | 943         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009092949 |
|    clip_fraction        | 0.225       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.16       |
|    explained_variance   | 0.568       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.677       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0557     |
|    value_loss           | 2.34

-------- Rollout Summary --------
Total mean reward: 92.0
Standard deviation of reward: 0.0
Average successful assignments: 86.20962962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -39.3       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 90          |
|    time_elapsed         | 1034        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008193877 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.89       |
|    explained_variance   | 0.638       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.959       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0462     |
|    value_loss           | 2.51

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 89.6482993197279
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -24.5       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 98          |
|    time_elapsed         | 1128        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.007491453 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.81       |
|    explained_variance   | 0.728       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0442     |
|    value_loss           | 2.76 

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -178.0
Standard deviation of reward: 0.0
Average successful assignments: 9.266666666666667
All assignments history: [6, 6, 5, 6, 6, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 104      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -148.0
Standard deviation of reward: 0.0
Average successful assignments: 14.333333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007540983 |
|    clip_fraction        | 0.0673      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | -0.229      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.77        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0441     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 35.11333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 10          |
|    time_elapsed         | 109         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011053174 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.0805      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0445     |
|    value_loss           | 5.5

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 49.53703703703704
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 18          |
|    time_elapsed         | 198         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011906799 |
|    clip_fraction        | 0.218       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.49        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.954       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0501     |
|    value_loss           | 4.09 

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 56.13076923076923
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 26          |
|    time_elapsed         | 285         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009941991 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.38       |
|    explained_variance   | 0.59        |
|    learning_rate        | 0.00018     |
|    loss                 | 2.67        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0523     |
|    value_loss           | 4.06 

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 60.30980392156863
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 34          |
|    time_elapsed         | 372         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009068547 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.25       |
|    explained_variance   | 0.691       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.27        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0509     |
|    value_loss           | 3.35

-------- Rollout Summary --------
Total mean reward: 32.0
Standard deviation of reward: 0.0
Average successful assignments: 64.4
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -165        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 42          |
|    time_elapsed         | 463         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009362927 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.97       |
|    explained_variance   | 0.775       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.63        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.056      |
|    value_loss           | 2.54        |
---

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 68.49466666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -147        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 50          |
|    time_elapsed         | 554         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007922277 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.51       |
|    explained_variance   | 0.771       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.463       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0508     |
|    value_loss           | 2.15

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 72.76666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -124        |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 58          |
|    time_elapsed         | 644         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008080682 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.04       |
|    explained_variance   | 0.694       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.784       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0506     |
|    value_loss           | 2.45

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 77.01313131313131
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -98.8       |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 66          |
|    time_elapsed         | 734         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010107532 |
|    clip_fraction        | 0.221       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.67       |
|    explained_variance   | 0.618       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.652       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0553     |
|    value_loss           | 2.16

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 80.93873873873873
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -74.6       |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 74          |
|    time_elapsed         | 826         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.010634901 |
|    clip_fraction        | 0.226       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.35       |
|    explained_variance   | 0.391       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.554       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0573     |
|    value_loss           | 2.42

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 84.74634146341464
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -52.3       |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 82          |
|    time_elapsed         | 917         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009432863 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4          |
|    explained_variance   | 0.359       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.655       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0538     |
|    value_loss           | 2.3 

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 88.10666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -34.6       |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 90          |
|    time_elapsed         | 1006        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009254251 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.86       |
|    explained_variance   | 0.368       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.15        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0551     |
|    value_loss           | 3.09

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 91.11836734693877
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -21.1       |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 98          |
|    time_elapsed         | 1096        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009444186 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.86       |
|    explained_variance   | 0.452       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.731       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0517     |
|    value_loss           | 2.71

logged results till here

In [6]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -194.0
Standard deviation of reward: 0.0
Average successful assignments: 4.2
All assignments history: [10, 6, 5, 7, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 97       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 10.933333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008128035 |
|    clip_fraction        | 0.0669      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | -0.0301     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.94        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0457     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 25.2
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 10          |
|    time_elapsed         | 108         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012687614 |
|    clip_fraction        | 0.224       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.131       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.56        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0488     |
|    value_loss           | 4.98        |
----

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 48.629629629629626
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 18          |
|    time_elapsed         | 203         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011925148 |
|    clip_fraction        | 0.217       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.547       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.25        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0505     |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 59.64358974358974
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 26          |
|    time_elapsed         | 301         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009566825 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.38       |
|    explained_variance   | 0.69        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.6         |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 3.43

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 64.75098039215686
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -177       |
| time/                   |            |
|    fps                  | 88         |
|    iterations           | 34         |
|    time_elapsed         | 395        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.01112359 |
|    clip_fraction        | 0.213      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.3       |
|    explained_variance   | 0.742      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.06       |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0573    |
|    value_loss           | 2.95       |
----------

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 68.91111111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 42          |
|    time_elapsed         | 493         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009039199 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.13       |
|    explained_variance   | 0.755       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0531     |
|    value_loss           | 2.63

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 73.092
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -155        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 590         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009545028 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | 0.705       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.826       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.056      |
|    value_loss           | 2.56        |
-

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 77.11379310344827
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -132        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 58          |
|    time_elapsed         | 688         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008484132 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.23       |
|    explained_variance   | 0.579       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.857       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0527     |
|    value_loss           | 2.8 

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 81.02222222222223
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -102        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 66          |
|    time_elapsed         | 784         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007849868 |
|    clip_fraction        | 0.157       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.63       |
|    explained_variance   | 0.443       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.958       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0502     |
|    value_loss           | 2.99

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 84.46936936936937
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -73.3       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 74          |
|    time_elapsed         | 879         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.007859953 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.3        |
|    explained_variance   | 0.452       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.13        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.048      |
|    value_loss           | 2.73

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 87.61463414634146
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -49.3      |
| time/                   |            |
|    fps                  | 86         |
|    iterations           | 82         |
|    time_elapsed         | 975        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00909327 |
|    clip_fraction        | 0.157      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.08      |
|    explained_variance   | 0.513      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.881      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0461    |
|    value_loss           | 2.6        |
----------

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 90.36
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -35.7       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 90          |
|    time_elapsed         | 1072        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009091752 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.01       |
|    explained_variance   | 0.527       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.936       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 2.52        |
--

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 92.78503401360544
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -27         |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 98          |
|    time_elapsed         | 1168        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008310042 |
|    clip_fraction        | 0.177       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.94       |
|    explained_variance   | 0.568       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0481     |
|    value_loss           | 2.54

In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 7.066666666666666
All assignments history: [4, 13, 8, 8, 3, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 98       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -162.0
Standard deviation of reward: 0.0
Average successful assignments: 10.966666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 96          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008317416 |
|    clip_fraction        | 0.0744      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | 0.00284     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.67        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0467     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -74.0
Standard deviation of reward: 0.0
Average successful assignments: 31.92
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 10          |
|    time_elapsed         | 111         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011161733 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.126       |
|    learning_rate        | 0.00018     |
|    loss                 | 3.39        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0469     |
|    value_loss           | 5.33        |
-

-------- Rollout Summary --------
Total mean reward: -40.0
Standard deviation of reward: 0.0
Average successful assignments: 44.13703703703704
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 18          |
|    time_elapsed         | 207         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012760425 |
|    clip_fraction        | 0.258       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.4        |
|    explained_variance   | 0.54        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.343       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: -28.0
Standard deviation of reward: 0.0
Average successful assignments: 49.44102564102564
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 26          |
|    time_elapsed         | 304         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010520361 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.34       |
|    explained_variance   | 0.666       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.417       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0529     |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 53.6078431372549
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 34          |
|    time_elapsed         | 402         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.010713767 |
|    clip_fraction        | 0.208       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.2        |
|    explained_variance   | 0.73        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.991       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0565     |
|    value_loss           | 3.04 

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 57.74603174603175
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -161        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 42          |
|    time_elapsed         | 500         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010357188 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.9        |
|    explained_variance   | 0.75        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.657       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0578     |
|    value_loss           | 2.64

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 62.55866666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -143        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 50          |
|    time_elapsed         | 598         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009780299 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.74        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.76        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0577     |
|    value_loss           | 2.72

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 67.25287356321839
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -120        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 58          |
|    time_elapsed         | 693         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009830829 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.14       |
|    explained_variance   | 0.72        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.922       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0553     |
|    value_loss           | 2.47

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 71.8949494949495
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -101        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 66          |
|    time_elapsed         | 791         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009978566 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.83       |
|    explained_variance   | 0.755       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.659       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0542     |
|    value_loss           | 1.96 

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 76.11441441441441
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -83.3       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 74          |
|    time_elapsed         | 886         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008919889 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.64       |
|    explained_variance   | 0.656       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.836       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0499     |
|    value_loss           | 2.11

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 80.0040650406504
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -67.7       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 82          |
|    time_elapsed         | 976         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008988391 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.39       |
|    explained_variance   | 0.733       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.449       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0531     |
|    value_loss           | 1.95 

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 83.46666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -53.2       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 90          |
|    time_elapsed         | 1066        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008500714 |
|    clip_fraction        | 0.159       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.12       |
|    explained_variance   | 0.665       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.578       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0464     |
|    value_loss           | 2.03

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 86.62789115646258
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -37.4        |
| time/                   |              |
|    fps                  | 86           |
|    iterations           | 98           |
|    time_elapsed         | 1158         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0071731294 |
|    clip_fraction        | 0.165        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.73        |
|    explained_variance   | 0.639        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.03         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0494      |
|    value_lo

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -196.0
Standard deviation of reward: 0.0
Average successful assignments: 3.7333333333333334
All assignments history: [9, 8, 7, 5, 7, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 95       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -166.0
Standard deviation of reward: 0.0
Average successful assignments: 8.666666666666666
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -186       |
| time/                   |            |
|    fps                  | 90         |
|    iterations           | 2          |
|    time_elapsed         | 22         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00801836 |
|    clip_fraction        | 0.0675     |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.49      |
|    explained_variance   | -0.256     |
|    learning_rate        | 0.00018    |
|    loss                 | 2.64       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0437    |
|    value_loss           | 15.8       |
--------

-------- Rollout Summary --------
Total mean reward: -20.0
Standard deviation of reward: 0.0
Average successful assignments: 44.11333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 10          |
|    time_elapsed         | 115         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012365245 |
|    clip_fraction        | 0.222       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.0467      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.67        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0474     |
|    value_loss           | 5.2

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 60.096296296296295
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 18          |
|    time_elapsed         | 211         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012354893 |
|    clip_fraction        | 0.248       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.42       |
|    explained_variance   | 0.422       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.15        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0542     |
|    value_loss           | 4.4

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 68.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 26          |
|    time_elapsed         | 306         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010848236 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.36       |
|    explained_variance   | 0.606       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.053      |
|    value_loss           | 3.86        |
---

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 72.80392156862744
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -175         |
| time/                   |              |
|    fps                  | 86           |
|    iterations           | 34           |
|    time_elapsed         | 401          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0118565615 |
|    clip_fraction        | 0.232        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.24        |
|    explained_variance   | 0.693        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.694        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.057       |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 75.86031746031746
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -164        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 42          |
|    time_elapsed         | 496         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008879466 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.96       |
|    explained_variance   | 0.73        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.877       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.052      |
|    value_loss           | 3.03

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 78.76933333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -146        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 591         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008102084 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.47       |
|    explained_variance   | 0.762       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.58        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0517     |
|    value_loss           | 2.58

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 82.01609195402298
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -119        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 58          |
|    time_elapsed         | 686         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008576469 |
|    clip_fraction        | 0.171       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.02       |
|    explained_variance   | 0.78        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.868       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0544     |
|    value_loss           | 2.42

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 85.15050505050505
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -91.2       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 66          |
|    time_elapsed         | 776         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007937777 |
|    clip_fraction        | 0.154       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.53       |
|    explained_variance   | 0.769       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.813       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 2.58

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 88.15495495495496
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -67.7        |
| time/                   |              |
|    fps                  | 87           |
|    iterations           | 74           |
|    time_elapsed         | 867          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0073818066 |
|    clip_fraction        | 0.155        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.33        |
|    explained_variance   | 0.795        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.759        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.048       |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 90.79186991869919
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -51.3       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 82          |
|    time_elapsed         | 952         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008975535 |
|    clip_fraction        | 0.209       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.1        |
|    explained_variance   | 0.753       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.596       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0542     |
|    value_loss           | 2.35

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 93.11481481481482
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -39.4       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 90          |
|    time_elapsed         | 1035        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009114273 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.02       |
|    explained_variance   | 0.668       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.587       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 2.36

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 95.16122448979591
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -32.5       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 98          |
|    time_elapsed         | 1119        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008255782 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.97       |
|    explained_variance   | 0.615       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.627       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 2.35

In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 4.866666666666666
All assignments history: [9, 9, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 111      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -184.0
Standard deviation of reward: 0.0
Average successful assignments: 6.1
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 103         |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008388348 |
|    clip_fraction        | 0.0825      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | -0.297      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.48        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0446     |
|    value_loss           | 16.7        |
--

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 23.06
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 99          |
|    iterations           | 10          |
|    time_elapsed         | 102         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011437462 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.0716      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0434     |
|    value_loss           | 5.36        |
--

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 45.074074074074076
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 99          |
|    iterations           | 18          |
|    time_elapsed         | 185         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011029473 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.41       |
|    explained_variance   | 0.507       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.85        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0493     |
|    value_loss           | 3.9

-------- Rollout Summary --------
Total mean reward: 20.0
Standard deviation of reward: 0.0
Average successful assignments: 55.33846153846154
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 26          |
|    time_elapsed         | 265         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008196884 |
|    clip_fraction        | 0.142       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.636       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.442       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0475     |
|    value_loss           | 3.76

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 60.8
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 102         |
|    iterations           | 34          |
|    time_elapsed         | 339         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009000547 |
|    clip_fraction        | 0.154       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.28       |
|    explained_variance   | 0.735       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.918       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0479     |
|    value_loss           | 3.03        |
---

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 65.53015873015873
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -167        |
| time/                   |             |
|    fps                  | 104         |
|    iterations           | 42          |
|    time_elapsed         | 410         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009316182 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.08       |
|    explained_variance   | 0.756       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.591       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 3   

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 69.616
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -152        |
| time/                   |             |
|    fps                  | 105         |
|    iterations           | 50          |
|    time_elapsed         | 483         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010390209 |
|    clip_fraction        | 0.224       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.61       |
|    explained_variance   | 0.76        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.82        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0604     |
|    value_loss           | 2.59        |
-

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 73.99655172413793
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -127        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 58          |
|    time_elapsed         | 554         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009107358 |
|    clip_fraction        | 0.185       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.06       |
|    explained_variance   | 0.61        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.16        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0549     |
|    value_loss           | 2.93

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 78.34444444444445
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -94.7       |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 66          |
|    time_elapsed         | 626         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009008916 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.53       |
|    explained_variance   | 0.532       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.803       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0513     |
|    value_loss           | 2.6 

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 82.23333333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -64.2      |
| time/                   |            |
|    fps                  | 108        |
|    iterations           | 74         |
|    time_elapsed         | 699        |
|    total_timesteps      | 75776      |
| train/                  |            |
|    approx_kl            | 0.00910388 |
|    clip_fraction        | 0.175      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.25      |
|    explained_variance   | 0.485      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.22       |
|    n_updates            | 730        |
|    policy_gradient_loss | -0.048     |
|    value_loss           | 2.77       |
----------

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 85.67073170731707
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -44          |
| time/                   |              |
|    fps                  | 108          |
|    iterations           | 82           |
|    time_elapsed         | 773          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0087601645 |
|    clip_fraction        | 0.179        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.13        |
|    explained_variance   | 0.509        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.858        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0504      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 88.71629629629629
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -29.8        |
| time/                   |              |
|    fps                  | 109          |
|    iterations           | 90           |
|    time_elapsed         | 843          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0076293075 |
|    clip_fraction        | 0.145        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.9         |
|    explained_variance   | 0.574        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.819        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0444      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 91.41768707482993
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -19.8       |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 98          |
|    time_elapsed         | 917         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009290062 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.78       |
|    explained_variance   | 0.517       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.59        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0469     |
|    value_loss           | 2.27

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -174.0
Standard deviation of reward: 0.0
Average successful assignments: 11.466666666666667
All assignments history: [13, 7, 8, 5, 9, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -183     |
| time/              |          |
|    fps             | 118      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 14.366666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 116         |
|    iterations           | 2           |
|    time_elapsed         | 17          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008145453 |
|    clip_fraction        | 0.071       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.49       |
|    explained_variance   | -0.0108     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.54        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0459     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 44.413333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 115         |
|    iterations           | 10          |
|    time_elapsed         | 88          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011829655 |
|    clip_fraction        | 0.21        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.118       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0464     |
|    value_loss           | 5.32

-------- Rollout Summary --------
Total mean reward: -14.0
Standard deviation of reward: 0.0
Average successful assignments: 55.39259259259259
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 114         |
|    iterations           | 18          |
|    time_elapsed         | 160         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011907276 |
|    clip_fraction        | 0.228       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.42       |
|    explained_variance   | 0.582       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.7         |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0546     |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 59.28974358974359
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 26          |
|    time_elapsed         | 235         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010231253 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.35       |
|    explained_variance   | 0.785       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.579       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0522     |
|    value_loss           | 2.53 

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 62.51960784313726
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 34          |
|    time_elapsed         | 324         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008938653 |
|    clip_fraction        | 0.15        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.21       |
|    explained_variance   | 0.817       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.635       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0512     |
|    value_loss           | 2.35 

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 66.02698412698413
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -163        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 42          |
|    time_elapsed         | 394         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009524194 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.85       |
|    explained_variance   | 0.834       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.65        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0576     |
|    value_loss           | 2.05

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 69.81733333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -141        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 50          |
|    time_elapsed         | 473         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.011239752 |
|    clip_fraction        | 0.226       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.38       |
|    explained_variance   | 0.756       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.463       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0594     |
|    value_loss           | 2.24

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 73.3287356321839
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -118        |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 58          |
|    time_elapsed         | 547         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009057376 |
|    clip_fraction        | 0.198       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.99       |
|    explained_variance   | 0.724       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.756       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0565     |
|    value_loss           | 2.26 

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 77.13434343434344
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -96.3       |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 66          |
|    time_elapsed         | 619         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010050236 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.73       |
|    explained_variance   | 0.643       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.884       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0541     |
|    value_loss           | 2.24

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 80.82972972972973
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -79.6       |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 74          |
|    time_elapsed         | 688         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008360887 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.44       |
|    explained_variance   | 0.52        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.637       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0498     |
|    value_loss           | 2.45

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 84.27804878048781
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -64.3       |
| time/                   |             |
|    fps                  | 111         |
|    iterations           | 82          |
|    time_elapsed         | 755         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.008150786 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.17       |
|    explained_variance   | 0.558       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.584       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0472     |
|    value_loss           | 2.38

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 87.34296296296296
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -47.6       |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 90          |
|    time_elapsed         | 821         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008310009 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.91       |
|    explained_variance   | 0.618       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.475       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0472     |
|    value_loss           | 1.81

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 90.06802721088435
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -31.8       |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 98          |
|    time_elapsed         | 888         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008037605 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.83       |
|    explained_variance   | 0.587       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.799       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0434     |
|    value_loss           | 2.17

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -174.0
Standard deviation of reward: 0.0
Average successful assignments: 10.733333333333333
All assignments history: [6, 6, 7, 3, 9, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 138      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -156.0
Standard deviation of reward: 0.0
Average successful assignments: 13.633333333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -188         |
| time/                   |              |
|    fps                  | 130          |
|    iterations           | 2            |
|    time_elapsed         | 15           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0081339795 |
|    clip_fraction        | 0.072        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.49        |
|    explained_variance   | -0.265       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.37         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0449      |
|    value

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 36.86
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 10          |
|    time_elapsed         | 82          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011615112 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.176       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.48        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0482     |
|    value_loss           | 4.95        |
---

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 51.87777777777778
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 18          |
|    time_elapsed         | 153         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010802373 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.555       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.552       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0501     |
|    value_loss           | 4.31 

-------- Rollout Summary --------
Total mean reward: 0.0
Standard deviation of reward: 0.0
Average successful assignments: 57.82051282051282
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 119         |
|    iterations           | 26          |
|    time_elapsed         | 221         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010667693 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.725       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.61        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.055      |
|    value_loss           | 3.24 

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 62.286274509803924
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 120         |
|    iterations           | 34          |
|    time_elapsed         | 287         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009136254 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.27       |
|    explained_variance   | 0.811       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.62        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.052      |
|    value_loss           | 2.2

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 65.63333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -163        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 42          |
|    time_elapsed         | 353         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009113197 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.98       |
|    explained_variance   | 0.802       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0531     |
|    value_loss           | 2.51

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 69.34
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -146         |
| time/                   |              |
|    fps                  | 121          |
|    iterations           | 50           |
|    time_elapsed         | 419          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0096549075 |
|    clip_fraction        | 0.186        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.49        |
|    explained_variance   | 0.798        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.67         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0588      |
|    value_loss          

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 73.81494252873563
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -119        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 58          |
|    time_elapsed         | 486         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008986178 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.05       |
|    explained_variance   | 0.773       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.84        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.46

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 78.0111111111111
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -93          |
| time/                   |              |
|    fps                  | 122          |
|    iterations           | 66           |
|    time_elapsed         | 551          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0110922605 |
|    clip_fraction        | 0.223        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.67        |
|    explained_variance   | 0.678        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.819        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.058       |
|    value_los

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 82.1018018018018
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -69.7       |
| time/                   |             |
|    fps                  | 123         |
|    iterations           | 74          |
|    time_elapsed         | 614         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009634249 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.34       |
|    explained_variance   | 0.705       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.79        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0568     |
|    value_loss           | 2.25 

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 85.80813008130082
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -51.8       |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 82          |
|    time_elapsed         | 673         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009782599 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.16       |
|    explained_variance   | 0.559       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.891       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0555     |
|    value_loss           | 2.36

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 89.17407407407407
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -36.2       |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 90          |
|    time_elapsed         | 726         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.010381451 |
|    clip_fraction        | 0.228       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.94       |
|    explained_variance   | 0.62        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.758       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0542     |
|    value_loss           | 2.51

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 91.99319727891157
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -24.2       |
| time/                   |             |
|    fps                  | 129         |
|    iterations           | 98          |
|    time_elapsed         | 773         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.010410497 |
|    clip_fraction        | 0.218       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4          |
|    explained_variance   | 0.495       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.917       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0517     |
|    value_loss           | 2.51