In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def train_svr_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    svr_model = SVR()
    svr_model.fit(X_train_scaled, y_train)
    return svr_model, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.01.csv')
svr_model, scaler = train_svr_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.01.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = svr_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']  # This would be prior to overwriting with predictions if you run this block again

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


MAE: 0.6709803969580043
RMSE: 1.1802692335254894
R-squared: 0.9905780981775942
RAE: 0.07007735214500166


In [4]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -198.0
Standard deviation of reward: 0.0
Average successful assignments: 3.1333333333333333
All assignments history: [7, 9, 4, 11, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -185     |
| time/              |          |
|    fps             | 94       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -154.0
Standard deviation of reward: 0.0
Average successful assignments: 10.2
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -187         |
| time/                   |              |
|    fps                  | 89           |
|    iterations           | 2            |
|    time_elapsed         | 22           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0077732317 |
|    clip_fraction        | 0.0632       |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.51        |
|    explained_variance   | -0.213       |
|    learning_rate        | 0.00018      |
|    loss                 | 3.1          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0452      |
|    value_loss         

-------- Rollout Summary --------
Total mean reward: -132.0
Standard deviation of reward: 0.0
Average successful assignments: 22.086666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 10          |
|    time_elapsed         | 118         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011641461 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.48       |
|    explained_variance   | 0.0727      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.52        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0469     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -68.0
Standard deviation of reward: 0.0
Average successful assignments: 31.28148148148148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 18          |
|    time_elapsed         | 215         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011211335 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.52        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.632       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0497     |
|    value_loss           | 3.9

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 38.751282051282054
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 26          |
|    time_elapsed         | 316         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010268916 |
|    clip_fraction        | 0.182       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.42       |
|    explained_variance   | 0.663       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.546       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.054      |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 47.90196078431372
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -180       |
| time/                   |            |
|    fps                  | 83         |
|    iterations           | 34         |
|    time_elapsed         | 419        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.01081061 |
|    clip_fraction        | 0.185      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.35      |
|    explained_variance   | 0.706      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.19       |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0525    |
|    value_loss           | 3.14       |
----------

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 54.36825396825397
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -173        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 42          |
|    time_elapsed         | 517         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.011124025 |
|    clip_fraction        | 0.21        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.24       |
|    explained_variance   | 0.697       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.05        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0564     |
|    value_loss           | 3.07

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 59.54266666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -163        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 50          |
|    time_elapsed         | 616         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009435546 |
|    clip_fraction        | 0.175       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.03       |
|    explained_variance   | 0.658       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.33        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0553     |
|    value_loss           | 3.26

-------- Rollout Summary --------
Total mean reward: 42.0
Standard deviation of reward: 0.0
Average successful assignments: 63.63103448275862
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -150        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 58          |
|    time_elapsed         | 719         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.011121536 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.649       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.572       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0584     |
|    value_loss           | 2.87

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 67.47575757575757
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -132        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 66          |
|    time_elapsed         | 821         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008798251 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.39       |
|    explained_variance   | 0.652       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.47        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.053      |
|    value_loss           | 2.87

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 71.02972972972972
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -112        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 74          |
|    time_elapsed         | 922         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009921119 |
|    clip_fraction        | 0.223       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.99       |
|    explained_variance   | 0.505       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.01        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0598     |
|    value_loss           | 3.16

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 74.54471544715447
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -90.2       |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 82          |
|    time_elapsed         | 1029        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009132472 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.59       |
|    explained_variance   | 0.438       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.834       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0547     |
|    value_loss           | 2.76

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 77.82148148148148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -69.8       |
| time/                   |             |
|    fps                  | 80          |
|    iterations           | 90          |
|    time_elapsed         | 1141        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008268278 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.32       |
|    explained_variance   | 0.464       |
|    learning_rate        | 0.00018     |
|    loss                 | 1           |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0511     |
|    value_loss           | 3.05

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 80.8843537414966
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -52.4       |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 98          |
|    time_elapsed         | 1254        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009497426 |
|    clip_fraction        | 0.186       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.18       |
|    explained_variance   | 0.465       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.825       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0504     |
|    value_loss           | 2.49 

In [5]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -192.0
Standard deviation of reward: 0.0
Average successful assignments: 4.466666666666667
All assignments history: [2, 7, 6, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -189     |
| time/              |          |
|    fps             | 80       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -178.0
Standard deviation of reward: 0.0
Average successful assignments: 7.266666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 2           |
|    time_elapsed         | 26          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007536065 |
|    clip_fraction        | 0.0588      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | -0.172      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.65        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0431     |
|    value_loss           | 17

-------- Rollout Summary --------
Total mean reward: -4.0
Standard deviation of reward: 0.0
Average successful assignments: 48.093333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 10          |
|    time_elapsed         | 137         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011054646 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.48       |
|    explained_variance   | 0.0778      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.82        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0451     |
|    value_loss           | 5.5

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 56.98148148148148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 18          |
|    time_elapsed         | 245         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011281386 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.472       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.65        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0523     |
|    value_loss           | 4.3

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 60.751282051282054
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 26          |
|    time_elapsed         | 355         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008717148 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.42       |
|    explained_variance   | 0.712       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.46        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0482     |
|    value_loss           | 3.1 

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 64.0
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 34          |
|    time_elapsed         | 467         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007842116 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.31       |
|    explained_variance   | 0.779       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0495     |
|    value_loss           | 2.79        |
---

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 66.67460317460318
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -169        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 42          |
|    time_elapsed         | 567         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007958652 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.1        |
|    explained_variance   | 0.782       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.03        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0493     |
|    value_loss           | 2.56

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 69.31066666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -152        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 50          |
|    time_elapsed         | 673         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009029681 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.59       |
|    explained_variance   | 0.756       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.754       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0583     |
|    value_loss           | 2.31

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 71.96551724137932
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -129        |
| time/                   |             |
|    fps                  | 76          |
|    iterations           | 58          |
|    time_elapsed         | 773         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008832949 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.18       |
|    explained_variance   | 0.719       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.779       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0534     |
|    value_loss           | 2.19

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 74.82828282828282
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -104         |
| time/                   |              |
|    fps                  | 77           |
|    iterations           | 66           |
|    time_elapsed         | 873          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0107815955 |
|    clip_fraction        | 0.229        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.86        |
|    explained_variance   | 0.637        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.746        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0588      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 78.17567567567568
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -81.7       |
| time/                   |             |
|    fps                  | 77          |
|    iterations           | 74          |
|    time_elapsed         | 973         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.010705052 |
|    clip_fraction        | 0.237       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.6        |
|    explained_variance   | 0.69        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.544       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.059      |
|    value_loss           | 2   

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 81.32845528455285
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -64.2        |
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 82           |
|    time_elapsed         | 1070         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0093439575 |
|    clip_fraction        | 0.212        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.36        |
|    explained_variance   | 0.601        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.767        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0547      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 84.47555555555556
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -50.7       |
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 90          |
|    time_elapsed         | 1167        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009432945 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.15       |
|    explained_variance   | 0.661       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.993       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0546     |
|    value_loss           | 3.01

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 87.34081632653061
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -37         |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 98          |
|    time_elapsed         | 1265        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.011246156 |
|    clip_fraction        | 0.222       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.98       |
|    explained_variance   | 0.742       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.819       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0503     |
|    value_loss           | 2.4 

In [6]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -190.0
Standard deviation of reward: 0.0
Average successful assignments: 5.133333333333334
All assignments history: [6, 7, 5, 3, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -189     |
| time/              |          |
|    fps             | 93       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -164.0
Standard deviation of reward: 0.0
Average successful assignments: 9.633333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -188        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 2           |
|    time_elapsed         | 22          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007429569 |
|    clip_fraction        | 0.0597      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | -0.186      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.29        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0436     |
|    value_loss           | 18

-------- Rollout Summary --------
Total mean reward: -102.0
Standard deviation of reward: 0.0
Average successful assignments: 22.246666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 10          |
|    time_elapsed         | 119         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012067819 |
|    clip_fraction        | 0.232       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.107       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.899       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0481     |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 43.15555555555556
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 18          |
|    time_elapsed         | 216         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010769431 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.437       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.37        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.047      |
|    value_loss           | 4.39

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 54.36153846153846
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -179        |
| time/                   |             |
|    fps                  | 84          |
|    iterations           | 26          |
|    time_elapsed         | 313         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.010801803 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.39       |
|    explained_variance   | 0.59        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.47        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0538     |
|    value_loss           | 3.68

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 60.94705882352941
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 34          |
|    time_elapsed         | 406         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008886927 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.28       |
|    explained_variance   | 0.711       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.247       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.052      |
|    value_loss           | 2.93

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 66.07301587301588
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -165        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 42          |
|    time_elapsed         | 496         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.009898182 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.08       |
|    explained_variance   | 0.767       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.619       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0552     |
|    value_loss           | 2.38

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 70.24666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -151        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 50          |
|    time_elapsed         | 586         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010253901 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.64       |
|    explained_variance   | 0.708       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.461       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0588     |
|    value_loss           | 2.29

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 74.01379310344828
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -128         |
| time/                   |              |
|    fps                  | 87           |
|    iterations           | 58           |
|    time_elapsed         | 675          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0105235055 |
|    clip_fraction        | 0.232        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.11        |
|    explained_variance   | 0.644        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.773        |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0604      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 77.70202020202021
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -100         |
| time/                   |              |
|    fps                  | 88           |
|    iterations           | 66           |
|    time_elapsed         | 763          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0077113463 |
|    clip_fraction        | 0.136        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.65        |
|    explained_variance   | 0.608        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.807        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0478      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 81.02432432432433
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -73.7        |
| time/                   |              |
|    fps                  | 88           |
|    iterations           | 74           |
|    time_elapsed         | 851          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0074135745 |
|    clip_fraction        | 0.139        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.33        |
|    explained_variance   | 0.597        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.47         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0469      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 83.95121951219512
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -53.3        |
| time/                   |              |
|    fps                  | 89           |
|    iterations           | 82           |
|    time_elapsed         | 940          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0077678636 |
|    clip_fraction        | 0.144        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.07        |
|    explained_variance   | 0.584        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.858        |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0436      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 86.47111111111111
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -40.8       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 90          |
|    time_elapsed         | 1029        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008141235 |
|    clip_fraction        | 0.165       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.97       |
|    explained_variance   | 0.565       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.846       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0495     |
|    value_loss           | 2.96

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 88.68707482993197
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -33.4       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 98          |
|    time_elapsed         | 1122        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008622419 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.05       |
|    explained_variance   | 0.628       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.662       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.05       |
|    value_loss           | 2.21

In [7]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -180.0
Standard deviation of reward: 0.0
Average successful assignments: 8.933333333333334
All assignments history: [7, 8, 4, 7, 8, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 100      |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -112.0
Standard deviation of reward: 0.0
Average successful assignments: 20.1
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008150047 |
|    clip_fraction        | 0.0803      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | -0.199      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.59        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.045      |
|    value_loss           | 17.4        |
-

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 38.49333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 95          |
|    iterations           | 10          |
|    time_elapsed         | 106         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012213819 |
|    clip_fraction        | 0.217       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.175       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.972       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0483     |
|    value_loss           | 5.17 

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 52.58518518518518
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -184       |
| time/                   |            |
|    fps                  | 94         |
|    iterations           | 18         |
|    time_elapsed         | 194        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.01264477 |
|    clip_fraction        | 0.225      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.46      |
|    explained_variance   | 0.582      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.85       |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.0506    |
|    value_loss           | 3.69       |
-----------

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 59.31025641025641
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -182       |
| time/                   |            |
|    fps                  | 94         |
|    iterations           | 26         |
|    time_elapsed         | 281        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.01112232 |
|    clip_fraction        | 0.219      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.4       |
|    explained_variance   | 0.72       |
|    learning_rate        | 0.00018    |
|    loss                 | 1.39       |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0551    |
|    value_loss           | 3.15       |
----------

-------- Rollout Summary --------
Total mean reward: 44.0
Standard deviation of reward: 0.0
Average successful assignments: 65.23333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 34          |
|    time_elapsed         | 367         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009600017 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.32       |
|    explained_variance   | 0.796       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.908       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0527     |
|    value_loss           | 2.79

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 69.77777777777777
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -167        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 42          |
|    time_elapsed         | 456         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010159197 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.09       |
|    explained_variance   | 0.797       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.941       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0576     |
|    value_loss           | 2.42

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 73.7
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -151       |
| time/                   |            |
|    fps                  | 94         |
|    iterations           | 50         |
|    time_elapsed         | 544        |
|    total_timesteps      | 51200      |
| train/                  |            |
|    approx_kl            | 0.00855818 |
|    clip_fraction        | 0.157      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.68      |
|    explained_variance   | 0.739      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.703      |
|    n_updates            | 490        |
|    policy_gradient_loss | -0.051     |
|    value_loss           | 2.62       |
-----------------------

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 77.58390804597701
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -129        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 58          |
|    time_elapsed         | 633         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009911502 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.3        |
|    explained_variance   | 0.695       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.735       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0523     |
|    value_loss           | 2.27

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 81.20505050505051
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -103        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 66          |
|    time_elapsed         | 723         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009421788 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.87       |
|    explained_variance   | 0.635       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.928       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0571     |
|    value_loss           | 2.43

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 84.65855855855855
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -82         |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 74          |
|    time_elapsed         | 811         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.008741219 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.65       |
|    explained_variance   | 0.494       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.851       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 2.4 

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 88.0390243902439
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -62.8       |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 82          |
|    time_elapsed         | 901         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.010173685 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.31       |
|    explained_variance   | 0.447       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.949       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 2.41 

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 90.9437037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -46.4       |
| time/                   |             |
|    fps                  | 92          |
|    iterations           | 90          |
|    time_elapsed         | 991         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009637703 |
|    clip_fraction        | 0.227       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.11       |
|    explained_variance   | 0.362       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.835       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0549     |
|    value_loss           | 2.33 

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 93.57006802721088
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -32.8       |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 98          |
|    time_elapsed         | 1078        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009215064 |
|    clip_fraction        | 0.219       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.03       |
|    explained_variance   | 0.377       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.848       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0541     |
|    value_loss           | 2.62

In [8]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -178.0
Standard deviation of reward: 0.0
Average successful assignments: 9.066666666666666
All assignments history: [4, 4, 8, 5, 5, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -190     |
| time/              |          |
|    fps             | 120      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -162.0
Standard deviation of reward: 0.0
Average successful assignments: 11.7
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -190        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007517432 |
|    clip_fraction        | 0.064       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | -0.135      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.6         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0442     |
|    value_loss           | 16.1        |
-

-------- Rollout Summary --------
Total mean reward: -72.0
Standard deviation of reward: 0.0
Average successful assignments: 29.24
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 94          |
|    iterations           | 10          |
|    time_elapsed         | 108         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012049302 |
|    clip_fraction        | 0.218       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.48       |
|    explained_variance   | 0.146       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.9         |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0486     |
|    value_loss           | 5.19        |
-

-------- Rollout Summary --------
Total mean reward: -52.0
Standard deviation of reward: 0.0
Average successful assignments: 36.94814814814815
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 18          |
|    time_elapsed         | 201         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010169744 |
|    clip_fraction        | 0.181       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.45       |
|    explained_variance   | 0.496       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.53        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0499     |
|    value_loss           | 4.0

-------- Rollout Summary --------
Total mean reward: -32.0
Standard deviation of reward: 0.0
Average successful assignments: 43.82820512820513
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 26          |
|    time_elapsed         | 297         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009123975 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.4        |
|    explained_variance   | 0.609       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0479     |
|    value_loss           | 3.8

-------- Rollout Summary --------
Total mean reward: 2.0
Standard deviation of reward: 0.0
Average successful assignments: 49.641176470588235
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 34          |
|    time_elapsed         | 392         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.011386182 |
|    clip_fraction        | 0.21        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.29       |
|    explained_variance   | 0.708       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.95        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0573     |
|    value_loss           | 2.93

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 55.15079365079365
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -167        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 42          |
|    time_elapsed         | 490         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010839637 |
|    clip_fraction        | 0.213       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.1        |
|    explained_variance   | 0.731       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.456       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0575     |
|    value_loss           | 2.66

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 59.894666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -153        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 588         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.010781422 |
|    clip_fraction        | 0.206       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.76       |
|    explained_variance   | 0.686       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.923       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0582     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 64.4183908045977
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -133        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 58          |
|    time_elapsed         | 685         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010691473 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.26       |
|    explained_variance   | 0.627       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.657       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0584     |
|    value_loss           | 2.37 

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 68.98787878787878
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -109        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 66          |
|    time_elapsed         | 781         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010416523 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.98       |
|    explained_variance   | 0.601       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0589     |
|    value_loss           | 2.66

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 73.32162162162162
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -87.3       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 74          |
|    time_elapsed         | 878         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.011045681 |
|    clip_fraction        | 0.244       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.7        |
|    explained_variance   | 0.6         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.627       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0619     |
|    value_loss           | 2.16

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 77.27235772357723
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -71.3      |
| time/                   |            |
|    fps                  | 85         |
|    iterations           | 82         |
|    time_elapsed         | 977        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.01200106 |
|    clip_fraction        | 0.259      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.6       |
|    explained_variance   | 0.53       |
|    learning_rate        | 0.00018    |
|    loss                 | 0.703      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0572    |
|    value_loss           | 2.08       |
----------

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 80.69481481481482
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -60.4        |
| time/                   |              |
|    fps                  | 85           |
|    iterations           | 90           |
|    time_elapsed         | 1074         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0103101265 |
|    clip_fraction        | 0.218        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.45        |
|    explained_variance   | 0.564        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.567        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.055       |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 83.8061224489796
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -49.7       |
| time/                   |             |
|    fps                  | 85          |
|    iterations           | 98          |
|    time_elapsed         | 1171        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009417584 |
|    clip_fraction        | 0.2         |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.34       |
|    explained_variance   | 0.527       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.931       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0522     |
|    value_loss           | 2.38 

In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 6.333333333333333
All assignments history: [8, 8, 4, 7, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 92       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -152.0
Standard deviation of reward: 0.0
Average successful assignments: 12.366666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 2           |
|    time_elapsed         | 22          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008331743 |
|    clip_fraction        | 0.0784      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | -0.21       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.47        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0477     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -42.0
Standard deviation of reward: 0.0
Average successful assignments: 35.34
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 91          |
|    iterations           | 10          |
|    time_elapsed         | 111         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011957361 |
|    clip_fraction        | 0.223       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.0562      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.29        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.049      |
|    value_loss           | 5.28        |
-

-------- Rollout Summary --------
Total mean reward: -10.0
Standard deviation of reward: 0.0
Average successful assignments: 48.762962962962966
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 18          |
|    time_elapsed         | 205         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012057506 |
|    clip_fraction        | 0.238       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.44       |
|    explained_variance   | 0.483       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.42        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0515     |
|    value_loss           | 4.

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 56.812820512820515
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 26          |
|    time_elapsed         | 302         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009708166 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.39       |
|    explained_variance   | 0.631       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.848       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.049      |
|    value_loss           | 3.7

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 63.22156862745098
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 34          |
|    time_elapsed         | 397         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008823361 |
|    clip_fraction        | 0.143       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.28       |
|    explained_variance   | 0.697       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.56        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.05       |
|    value_loss           | 3.14

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 68.01904761904763
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -164        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 42          |
|    time_elapsed         | 492         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008008204 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.97       |
|    explained_variance   | 0.708       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.656       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0489     |
|    value_loss           | 2.75

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 72.02
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -146        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 50          |
|    time_elapsed         | 589         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008879071 |
|    clip_fraction        | 0.156       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.59       |
|    explained_variance   | 0.742       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.797       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0537     |
|    value_loss           | 2.76        |
--

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 75.92068965517241
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -122        |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 58          |
|    time_elapsed         | 683         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008354273 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.1        |
|    explained_variance   | 0.667       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.862       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.051      |
|    value_loss           | 2.58

-------- Rollout Summary --------
Total mean reward: 58.0
Standard deviation of reward: 0.0
Average successful assignments: 79.48383838383839
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -96.5       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 66          |
|    time_elapsed         | 781         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009139878 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.77       |
|    explained_variance   | 0.7         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.554       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0561     |
|    value_loss           | 2.04

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 82.82702702702703
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -74.4       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 74          |
|    time_elapsed         | 877         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009279141 |
|    clip_fraction        | 0.189       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.47       |
|    explained_variance   | 0.683       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.527       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0529     |
|    value_loss           | 2.18

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 86.13089430894308
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -55.5       |
| time/                   |             |
|    fps                  | 86          |
|    iterations           | 82          |
|    time_elapsed         | 967         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.010959687 |
|    clip_fraction        | 0.254       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.24       |
|    explained_variance   | 0.425       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.09        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0606     |
|    value_loss           | 2.39

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 89.17333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -41.3       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 90          |
|    time_elapsed         | 1058        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009305922 |
|    clip_fraction        | 0.205       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.08       |
|    explained_variance   | 0.426       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.793       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0542     |
|    value_loss           | 2.06

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 91.9360544217687
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -31.5       |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 98          |
|    time_elapsed         | 1149        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009268989 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.93       |
|    explained_variance   | 0.502       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.86        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0498     |
|    value_loss           | 2.16 

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -164.0
Standard deviation of reward: 0.0
Average successful assignments: 14.0
All assignments history: [4, 9, 5, 6, 6, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 99       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -178.0
Standard deviation of reward: 0.0
Average successful assignments: 11.633333333333333
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -188       |
| time/                   |            |
|    fps                  | 93         |
|    iterations           | 2          |
|    time_elapsed         | 21         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00780831 |
|    clip_fraction        | 0.0721     |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.51      |
|    explained_variance   | -0.199     |
|    learning_rate        | 0.00018    |
|    loss                 | 2.6        |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0434    |
|    value_loss           | 16         |
-------

-------- Rollout Summary --------
Total mean reward: -20.0
Standard deviation of reward: 0.0
Average successful assignments: 40.346666666666664
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 10          |
|    time_elapsed         | 114         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011797254 |
|    clip_fraction        | 0.211       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.123       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.36        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0462     |
|    value_loss           | 5.

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 53.82222222222222
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 18          |
|    time_elapsed         | 207         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010174727 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.43       |
|    explained_variance   | 0.582       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.823       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0495     |
|    value_loss           | 3.6 

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 58.87948717948718
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -181       |
| time/                   |            |
|    fps                  | 88         |
|    iterations           | 26         |
|    time_elapsed         | 300        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.01004138 |
|    clip_fraction        | 0.175      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.37      |
|    explained_variance   | 0.732      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.902      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0513    |
|    value_loss           | 3.04       |
----------

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 63.30196078431373
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -174         |
| time/                   |              |
|    fps                  | 88           |
|    iterations           | 34           |
|    time_elapsed         | 395          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0094952565 |
|    clip_fraction        | 0.174        |
|    clip_range           | 0.15         |
|    entropy_loss         | -6.23        |
|    explained_variance   | 0.789        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.348        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0541      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 66.53333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -163        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 42          |
|    time_elapsed         | 489         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010178369 |
|    clip_fraction        | 0.207       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.91       |
|    explained_variance   | 0.786       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.69        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0591     |
|    value_loss           | 2.35

-------- Rollout Summary --------
Total mean reward: 28.0
Standard deviation of reward: 0.0
Average successful assignments: 69.84533333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -143        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 50          |
|    time_elapsed         | 583         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008449558 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.772       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.551       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0501     |
|    value_loss           | 2.18

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 73.6264367816092
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -115        |
| time/                   |             |
|    fps                  | 87          |
|    iterations           | 58          |
|    time_elapsed         | 677         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007978044 |
|    clip_fraction        | 0.152       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.88       |
|    explained_variance   | 0.706       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.774       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0474     |
|    value_loss           | 2.23 

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 77.74242424242425
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -89.1        |
| time/                   |              |
|    fps                  | 87           |
|    iterations           | 66           |
|    time_elapsed         | 768          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0088476995 |
|    clip_fraction        | 0.18         |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.63        |
|    explained_variance   | 0.626        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.549        |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0523      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 81.5063063063063
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -68.5       |
| time/                   |             |
|    fps                  | 88          |
|    iterations           | 74          |
|    time_elapsed         | 858         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009725235 |
|    clip_fraction        | 0.196       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.35       |
|    explained_variance   | 0.591       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.838       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0523     |
|    value_loss           | 2.45 

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 84.76585365853659
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -53.3       |
| time/                   |             |
|    fps                  | 89          |
|    iterations           | 82          |
|    time_elapsed         | 940         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009632824 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.1        |
|    explained_variance   | 0.54        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.734       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0502     |
|    value_loss           | 2.18

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 87.79851851851852
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -37.3       |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 90          |
|    time_elapsed         | 1022        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009784815 |
|    clip_fraction        | 0.195       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.91       |
|    explained_variance   | 0.573       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.557       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0507     |
|    value_loss           | 2.12

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 90.69863945578231
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -25.7       |
| time/                   |             |
|    fps                  | 90          |
|    iterations           | 98          |
|    time_elapsed         | 1105        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.008016904 |
|    clip_fraction        | 0.162       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.87       |
|    explained_variance   | 0.58        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.544       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0473     |
|    value_loss           | 2.26

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -186.0
Standard deviation of reward: 0.0
Average successful assignments: 7.0
All assignments history: [9, 7, 3, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -186     |
| time/              |          |
|    fps             | 102      |
|    iterations      | 1        |
|    time_elapsed    | 9        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -184.0
Standard deviation of reward: 0.0
Average successful assignments: 7.3
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 101         |
|    iterations           | 2           |
|    time_elapsed         | 20          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007971896 |
|    clip_fraction        | 0.065       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | -0.0933     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.9         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0455     |
|    value_loss           | 17          |
--

-------- Rollout Summary --------
Total mean reward: -18.0
Standard deviation of reward: 0.0
Average successful assignments: 38.07333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 10          |
|    time_elapsed         | 101         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012227878 |
|    clip_fraction        | 0.235       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.0843      |
|    learning_rate        | 0.00018     |
|    loss                 | 0.471       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0484     |
|    value_loss           | 5.2

-------- Rollout Summary --------
Total mean reward: -22.0
Standard deviation of reward: 0.0
Average successful assignments: 49.21111111111111
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 200       |
|    ep_rew_mean          | -183      |
| time/                   |           |
|    fps                  | 100       |
|    iterations           | 18        |
|    time_elapsed         | 184       |
|    total_timesteps      | 18432     |
| train/                  |           |
|    approx_kl            | 0.0103609 |
|    clip_fraction        | 0.195     |
|    clip_range           | 0.15      |
|    entropy_loss         | -6.44     |
|    explained_variance   | 0.543     |
|    learning_rate        | 0.00018   |
|    loss                 | 1.37      |
|    n_updates            | 170       |
|    policy_gradient_loss | -0.0522   |
|    value_loss           | 3.97      |
-----------------------------

-------- Rollout Summary --------
Total mean reward: 18.0
Standard deviation of reward: 0.0
Average successful assignments: 56.12564102564102
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 100         |
|    iterations           | 26          |
|    time_elapsed         | 265         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009018665 |
|    clip_fraction        | 0.144       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.4        |
|    explained_variance   | 0.651       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.25        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0503     |
|    value_loss           | 3.69

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 60.44313725490196
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -176        |
| time/                   |             |
|    fps                  | 101         |
|    iterations           | 34          |
|    time_elapsed         | 341         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009353068 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.28       |
|    explained_variance   | 0.718       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.929       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0532     |
|    value_loss           | 3.09

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 64.72063492063492
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -167        |
| time/                   |             |
|    fps                  | 104         |
|    iterations           | 42          |
|    time_elapsed         | 413         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.011079861 |
|    clip_fraction        | 0.203       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.06       |
|    explained_variance   | 0.735       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.7         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0596     |
|    value_loss           | 2.67

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 68.79733333333333
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -150         |
| time/                   |              |
|    fps                  | 105          |
|    iterations           | 50           |
|    time_elapsed         | 484          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0100127775 |
|    clip_fraction        | 0.17         |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.65        |
|    explained_variance   | 0.734        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.837        |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0532      |
|    value_lo

-------- Rollout Summary --------
Total mean reward: 60.0
Standard deviation of reward: 0.0
Average successful assignments: 72.93218390804597
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -130        |
| time/                   |             |
|    fps                  | 106         |
|    iterations           | 58          |
|    time_elapsed         | 555         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.010261625 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.39       |
|    explained_variance   | 0.684       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.653       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0549     |
|    value_loss           | 2.36

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 76.7989898989899
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -112        |
| time/                   |             |
|    fps                  | 107         |
|    iterations           | 66          |
|    time_elapsed         | 627         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.010219179 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.17       |
|    explained_variance   | 0.634       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.315       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0562     |
|    value_loss           | 2.22 

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 80.38648648648649
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -95         |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 74          |
|    time_elapsed         | 701         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.011713086 |
|    clip_fraction        | 0.252       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.92       |
|    explained_variance   | 0.508       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.559       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.06       |
|    value_loss           | 2.33

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 83.53658536585365
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -78.5       |
| time/                   |             |
|    fps                  | 108         |
|    iterations           | 82          |
|    time_elapsed         | 773         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.010338813 |
|    clip_fraction        | 0.234       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.65       |
|    explained_variance   | 0.537       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.658       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0597     |
|    value_loss           | 2.27

-------- Rollout Summary --------
Total mean reward: 70.0
Standard deviation of reward: 0.0
Average successful assignments: 86.31555555555556
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -62.1       |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 90          |
|    time_elapsed         | 845         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.008885672 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.31       |
|    explained_variance   | 0.565       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.918       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0561     |
|    value_loss           | 2.39

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 88.8421768707483
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -47.4        |
| time/                   |              |
|    fps                  | 109          |
|    iterations           | 98           |
|    time_elapsed         | 916          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0077016214 |
|    clip_fraction        | 0.164        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.16        |
|    explained_variance   | 0.64         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.771        |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0502      |
|    value_los

In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -194.0
Standard deviation of reward: 0.0
Average successful assignments: 4.2
All assignments history: [6, 8, 4, 9, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -187     |
| time/              |          |
|    fps             | 129      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -162.0
Standard deviation of reward: 0.0
Average successful assignments: 9.8
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -185        |
| time/                   |             |
|    fps                  | 118         |
|    iterations           | 2           |
|    time_elapsed         | 17          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.008153839 |
|    clip_fraction        | 0.0766      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | -0.189      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.64        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0467     |
|    value_loss           | 16.6        |
--

-------- Rollout Summary --------
Total mean reward: 6.0
Standard deviation of reward: 0.0
Average successful assignments: 30.593333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 10          |
|    time_elapsed         | 90          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.012946786 |
|    clip_fraction        | 0.235       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.47       |
|    explained_variance   | 0.0773      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.33        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0496     |
|    value_loss           | 4.96

-------- Rollout Summary --------
Total mean reward: -2.0
Standard deviation of reward: 0.0
Average successful assignments: 48.81481481481482
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 114         |
|    iterations           | 18          |
|    time_elapsed         | 160         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012627153 |
|    clip_fraction        | 0.259       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.44       |
|    explained_variance   | 0.515       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.683       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0534     |
|    value_loss           | 4.04

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 55.53589743589744
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -180        |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 26          |
|    time_elapsed         | 235         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.011241399 |
|    clip_fraction        | 0.208       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.37       |
|    explained_variance   | 0.646       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.34        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0521     |
|    value_loss           | 3.72 

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 59.92156862745098
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -174       |
| time/                   |            |
|    fps                  | 107        |
|    iterations           | 34         |
|    time_elapsed         | 324        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.00982852 |
|    clip_fraction        | 0.171      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.26      |
|    explained_variance   | 0.718      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.04       |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0519    |
|    value_loss           | 3.24       |
----------

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 63.48253968253968
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -164        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 42          |
|    time_elapsed         | 391         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.010560093 |
|    clip_fraction        | 0.22        |
|    clip_range           | 0.15        |
|    entropy_loss         | -6          |
|    explained_variance   | 0.7         |
|    learning_rate        | 0.00018     |
|    loss                 | 0.466       |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0579     |
|    value_loss           | 2.96

-------- Rollout Summary --------
Total mean reward: 30.0
Standard deviation of reward: 0.0
Average successful assignments: 66.75333333333333
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -149        |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 50          |
|    time_elapsed         | 463         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008613175 |
|    clip_fraction        | 0.16        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.56       |
|    explained_variance   | 0.677       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.753       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0514     |
|    value_loss           | 2.59

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 70.90114942528736
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -125        |
| time/                   |             |
|    fps                  | 109         |
|    iterations           | 58          |
|    time_elapsed         | 540         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.009360204 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.99       |
|    explained_variance   | 0.593       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.649       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0552     |
|    value_loss           | 2.46

-------- Rollout Summary --------
Total mean reward: 54.0
Standard deviation of reward: 0.0
Average successful assignments: 74.97171717171717
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -97.6       |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 66          |
|    time_elapsed         | 614         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.009034155 |
|    clip_fraction        | 0.178       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.72       |
|    explained_variance   | 0.583       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.565       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.14

-------- Rollout Summary --------
Total mean reward: 68.0
Standard deviation of reward: 0.0
Average successful assignments: 78.75855855855856
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -75.2       |
| time/                   |             |
|    fps                  | 110         |
|    iterations           | 74          |
|    time_elapsed         | 685         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.007934491 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.49       |
|    explained_variance   | 0.504       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.929       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0491     |
|    value_loss           | 2.4 

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 82.6130081300813
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -59.2      |
| time/                   |            |
|    fps                  | 111        |
|    iterations           | 82         |
|    time_elapsed         | 753        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00886287 |
|    clip_fraction        | 0.199      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.23      |
|    explained_variance   | 0.467      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.17       |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0525    |
|    value_loss           | 2.76       |
-----------

-------- Rollout Summary --------
Total mean reward: 78.0
Standard deviation of reward: 0.0
Average successful assignments: 85.99333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -46         |
| time/                   |             |
|    fps                  | 112         |
|    iterations           | 90          |
|    time_elapsed         | 819         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.009795154 |
|    clip_fraction        | 0.211       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.06       |
|    explained_variance   | 0.435       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.732       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0528     |
|    value_loss           | 2.49

-------- Rollout Summary --------
Total mean reward: 80.0
Standard deviation of reward: 0.0
Average successful assignments: 88.92448979591836
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -36         |
| time/                   |             |
|    fps                  | 113         |
|    iterations           | 98          |
|    time_elapsed         | 885         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.009024529 |
|    clip_fraction        | 0.192       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4          |
|    explained_variance   | 0.325       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.14        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0499     |
|    value_loss           | 2.62

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks200.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -188.0
Standard deviation of reward: 0.0
Average successful assignments: 6.066666666666666
All assignments history: [9, 5, 3, 5, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 200      |
|    ep_rew_mean     | -188     |
| time/              |          |
|    fps             | 133      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -154.0
Standard deviation of reward: 0.0
Average successful assignments: 11.833333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -187        |
| time/                   |             |
|    fps                  | 130         |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007679795 |
|    clip_fraction        | 0.0627      |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.51       |
|    explained_variance   | -0.279      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.37        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0448     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -158.0
Standard deviation of reward: 0.0
Average successful assignments: 16.493333333333332
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -186        |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 10          |
|    time_elapsed         | 80          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010858469 |
|    clip_fraction        | 0.188       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.48       |
|    explained_variance   | 0.155       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.609       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.046      |
|    value_loss           | 5

-------- Rollout Summary --------
Total mean reward: -6.0
Standard deviation of reward: 0.0
Average successful assignments: 32.84074074074074
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 18          |
|    time_elapsed         | 147         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.011681961 |
|    clip_fraction        | 0.208       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.46       |
|    explained_variance   | 0.437       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.559       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0521     |
|    value_loss           | 4.37

-------- Rollout Summary --------
Total mean reward: 10.0
Standard deviation of reward: 0.0
Average successful assignments: 44.15128205128205
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -182        |
| time/                   |             |
|    fps                  | 121         |
|    iterations           | 26          |
|    time_elapsed         | 219         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009671431 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -6.41       |
|    explained_variance   | 0.639       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.51        |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0486     |
|    value_loss           | 3.55

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 51.60392156862745
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -177       |
| time/                   |            |
|    fps                  | 121        |
|    iterations           | 34         |
|    time_elapsed         | 286        |
|    total_timesteps      | 34816      |
| train/                  |            |
|    approx_kl            | 0.00887195 |
|    clip_fraction        | 0.155      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.31      |
|    explained_variance   | 0.724      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.324      |
|    n_updates            | 330        |
|    policy_gradient_loss | -0.0505    |
|    value_loss           | 3.01       |
-----------

-------- Rollout Summary --------
Total mean reward: 16.0
Standard deviation of reward: 0.0
Average successful assignments: 56.423809523809524
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 200        |
|    ep_rew_mean          | -170       |
| time/                   |            |
|    fps                  | 121        |
|    iterations           | 42         |
|    time_elapsed         | 353        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.01041457 |
|    clip_fraction        | 0.199      |
|    clip_range           | 0.15       |
|    entropy_loss         | -6.11      |
|    explained_variance   | 0.783      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.186      |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0582    |
|    value_loss           | 2.31       |
---------

-------- Rollout Summary --------
Total mean reward: 24.0
Standard deviation of reward: 0.0
Average successful assignments: 60.529333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -155        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 50          |
|    time_elapsed         | 419         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.009165545 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.73       |
|    explained_variance   | 0.767       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.648       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0514     |
|    value_loss           | 2.2

-------- Rollout Summary --------
Total mean reward: 34.0
Standard deviation of reward: 0.0
Average successful assignments: 64.72183908045977
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -135        |
| time/                   |             |
|    fps                  | 122         |
|    iterations           | 58          |
|    time_elapsed         | 483         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007971775 |
|    clip_fraction        | 0.161       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.35       |
|    explained_variance   | 0.77        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.642       |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0525     |
|    value_loss           | 1.9 

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 68.44444444444444
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -112        |
| time/                   |             |
|    fps                  | 123         |
|    iterations           | 66          |
|    time_elapsed         | 549         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007641277 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.99       |
|    explained_variance   | 0.736       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.694       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0476     |
|    value_loss           | 2.06

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 72.05945945945946
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -91.3       |
| time/                   |             |
|    fps                  | 123         |
|    iterations           | 74          |
|    time_elapsed         | 614         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.009091167 |
|    clip_fraction        | 0.176       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.77       |
|    explained_variance   | 0.58        |
|    learning_rate        | 0.00018     |
|    loss                 | 0.734       |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0543     |
|    value_loss           | 2.03

-------- Rollout Summary --------
Total mean reward: 72.0
Standard deviation of reward: 0.0
Average successful assignments: 75.87723577235772
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -73.3       |
| time/                   |             |
|    fps                  | 124         |
|    iterations           | 82          |
|    time_elapsed         | 674         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.009137412 |
|    clip_fraction        | 0.201       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.39       |
|    explained_variance   | 0.682       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.601       |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0548     |
|    value_loss           | 1.84

-------- Rollout Summary --------
Total mean reward: 76.0
Standard deviation of reward: 0.0
Average successful assignments: 79.76518518518519
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -53.2       |
| time/                   |             |
|    fps                  | 126         |
|    iterations           | 90          |
|    time_elapsed         | 731         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.010231259 |
|    clip_fraction        | 0.234       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.08       |
|    explained_variance   | 0.529       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.828       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0572     |
|    value_loss           | 2.01

-------- Rollout Summary --------
Total mean reward: 84.0
Standard deviation of reward: 0.0
Average successful assignments: 83.48571428571428
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 200         |
|    ep_rew_mean          | -32.7       |
| time/                   |             |
|    fps                  | 128         |
|    iterations           | 98          |
|    time_elapsed         | 782         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.011426208 |
|    clip_fraction        | 0.223       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.9        |
|    explained_variance   | 0.358       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.489       |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0492     |
|    value_loss           | 2.05