In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

def load_and_preprocess_data(file_path):
    data = pd.read_csv(file_path)
    X = data.drop(columns=['Eligible'])
    y = data['Eligible']
    return X, y

def optimize_knn_model(X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    knn_model = KNeighborsRegressor()
    param_grid = {
        'n_neighbors': [3, 5, 10, 20, 30, 40],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
        'p': [1, 2]  # 1: Manhattan distance, 2: Euclidean distance
    }
    grid_search = GridSearchCV(knn_model, param_grid, cv=5, verbose=1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    print("Best parameters:", grid_search.best_params_)
    best_knn = grid_search.best_estimator_
    return best_knn, scaler

# Load and train on VehicleTrainingDataset.csv
X_train, y_train = load_and_preprocess_data('VehicleTrainingDataset_Noisy_0.01.csv')
knn_model, scaler = optimize_knn_model(X_train, y_train)

# Predict eligibility scores on 1000VehicleDataset.csv
vehicles_df = pd.read_csv('1000VehicleDataset_Noisy_0.01.csv')
X_test = vehicles_df.drop(columns=['Eligible'])
X_test_scaled = scaler.transform(X_test)
predicted_scores = knn_model.predict(X_test_scaled)

# Assuming you have access to actual scores, replace this line with the actual score loading logic if available
y_actual = vehicles_df['Eligible']

# Replace actual scores with predicted ones
vehicles_df['Eligible'] = predicted_scores  

# Calculate metrics
mae = mean_absolute_error(y_actual, predicted_scores)
rmse = np.sqrt(mean_squared_error(y_actual, predicted_scores))
r_squared = r2_score(y_actual, predicted_scores)
rae = np.sum(np.abs(y_actual - predicted_scores)) / np.sum(np.abs(y_actual - np.mean(y_actual)))

# Output the results
print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r_squared}")
print(f"RAE: {rae}")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best parameters: {'metric': 'manhattan', 'n_neighbors': 10, 'p': 1, 'weights': 'distance'}
MAE: 1.7258836094021666
RMSE: 2.159748576425794
R-squared: 0.9684512036561819
RAE: 0.18025169439477964


In [9]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 9.0
All assignments history: [15, 13, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 237      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -368.0
Standard deviation of reward: 0.0
Average successful assignments: 12.166666666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -374        |
| time/                   |             |
|    fps                  | 228         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007189866 |
|    clip_fraction        | 0.0666      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.87       |
|    explained_variance   | -0.189      |
|    learning_rate        | 0.00018     |
|    loss                 | 3.28        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0399     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -56.0
Standard deviation of reward: 0.0
Average successful assignments: 80.16666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 177         |
|    iterations           | 10          |
|    time_elapsed         | 57          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010091679 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.81       |
|    explained_variance   | 0.00684     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.58        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0417     |
|    value_loss           | 3.9

-------- Rollout Summary --------
Total mean reward: 38.0
Standard deviation of reward: 0.0
Average successful assignments: 123.82407407407408
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 150         |
|    iterations           | 18          |
|    time_elapsed         | 122         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010734821 |
|    clip_fraction        | 0.233       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.72       |
|    explained_variance   | 0.085       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.5         |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0449     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 50.0
Standard deviation of reward: 0.0
Average successful assignments: 144.87179487179486
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -349         |
| time/                   |              |
|    fps                  | 119          |
|    iterations           | 26           |
|    time_elapsed         | 222          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0066266293 |
|    clip_fraction        | 0.109        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.48        |
|    explained_variance   | 0.398        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.439        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0371      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 90.0
Standard deviation of reward: 0.0
Average successful assignments: 160.3921568627451
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -328        |
| time/                   |             |
|    fps                  | 93          |
|    iterations           | 34          |
|    time_elapsed         | 371         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007321704 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.95       |
|    explained_variance   | 0.537       |
|    learning_rate        | 0.00018     |
|    loss                 | 2.02        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0412     |
|    value_loss           | 2.89

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 172.984126984127
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -300        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 42          |
|    time_elapsed         | 523         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.006951593 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.34       |
|    explained_variance   | 0.672       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.3         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0395     |
|    value_loss           | 2.66

-------- Rollout Summary --------
Total mean reward: 128.0
Standard deviation of reward: 0.0
Average successful assignments: 183.98833333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -272        |
| time/                   |             |
|    fps                  | 75          |
|    iterations           | 50          |
|    time_elapsed         | 680         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006249497 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.81       |
|    explained_variance   | 0.571       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0377     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 142.0
Standard deviation of reward: 0.0
Average successful assignments: 193.2212643678161
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -224         |
| time/                   |              |
|    fps                  | 71           |
|    iterations           | 58           |
|    time_elapsed         | 832          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0069167325 |
|    clip_fraction        | 0.131        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.46        |
|    explained_variance   | 0.484        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.1          |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0414      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 200.78156565656565
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -174        |
| time/                   |             |
|    fps                  | 68          |
|    iterations           | 66          |
|    time_elapsed         | 990         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007409986 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.25       |
|    explained_variance   | 0.475       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.974       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0416     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 207.18693693693695
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -123         |
| time/                   |              |
|    fps                  | 66           |
|    iterations           | 74           |
|    time_elapsed         | 1146         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0067935428 |
|    clip_fraction        | 0.124        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.07        |
|    explained_variance   | 0.438        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.68         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0352      |
|    value_

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 212.6747967479675
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -79.8       |
| time/                   |             |
|    fps                  | 63          |
|    iterations           | 82          |
|    time_elapsed         | 1315        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006461747 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.1        |
|    explained_variance   | 0.443       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.38        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0361     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 217.43703703703704
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -50.6        |
| time/                   |              |
|    fps                  | 62           |
|    iterations           | 90           |
|    time_elapsed         | 1483         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0071944688 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.14        |
|    explained_variance   | 0.559        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.54         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0345      |
|    value_

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 221.50850340136054
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -30.9        |
| time/                   |              |
|    fps                  | 60           |
|    iterations           | 98           |
|    time_elapsed         | 1655         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0063486453 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.13        |
|    explained_variance   | 0.564        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.45         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0362      |
|    value_

In [10]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")




Using cpu device
-------- Rollout Summary --------
Total mean reward: -388.0
Standard deviation of reward: 0.0
Average successful assignments: 7.166666666666667
All assignments history: [11, 15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -374     |
| time/              |          |
|    fps             | 51       |
|    iterations      | 1        |
|    time_elapsed    | 20       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -210.0
Standard deviation of reward: 0.0
Average successful assignments: 44.166666666666664
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -375        |
| time/                   |             |
|    fps                  | 49          |
|    iterations           | 2           |
|    time_elapsed         | 41          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007696669 |
|    clip_fraction        | 0.0805      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.87       |
|    explained_variance   | -0.0968     |
|    learning_rate        | 0.00018     |
|    loss                 | 2.71        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0398     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 138.19166666666666
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 47          |
|    iterations           | 10          |
|    time_elapsed         | 214         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009300368 |
|    clip_fraction        | 0.174       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.8        |
|    explained_variance   | 0.00276     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.731       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0391     |
|    value_loss           | 4.0

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 155.21759259259258
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 48          |
|    iterations           | 18          |
|    time_elapsed         | 377         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.012098573 |
|    clip_fraction        | 0.251       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.146       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.65        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0486     |
|    value_loss           | 2.77

-------- Rollout Summary --------
Total mean reward: 22.0
Standard deviation of reward: 0.0
Average successful assignments: 162.06089743589743
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -345        |
| time/                   |             |
|    fps                  | 49          |
|    iterations           | 26          |
|    time_elapsed         | 538         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.009390842 |
|    clip_fraction        | 0.194       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.41       |
|    explained_variance   | 0.262       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.431       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0477     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 66.0
Standard deviation of reward: 0.0
Average successful assignments: 170.74019607843138
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -323        |
| time/                   |             |
|    fps                  | 50          |
|    iterations           | 34          |
|    time_elapsed         | 693         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008082647 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.9        |
|    explained_variance   | 0.491       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.4         |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0474     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 114.0
Standard deviation of reward: 0.0
Average successful assignments: 180.96825396825398
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -298        |
| time/                   |             |
|    fps                  | 50          |
|    iterations           | 42          |
|    time_elapsed         | 849         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008570278 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.33       |
|    explained_variance   | 0.5         |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0472     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 189.68166666666667
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -272      |
| time/                   |           |
|    fps                  | 51        |
|    iterations           | 50        |
|    time_elapsed         | 1002      |
|    total_timesteps      | 51200     |
| train/                  |           |
|    approx_kl            | 0.0077175 |
|    clip_fraction        | 0.16      |
|    clip_range           | 0.15      |
|    entropy_loss         | -3.91     |
|    explained_variance   | 0.475     |
|    learning_rate        | 0.00018   |
|    loss                 | 1.18      |
|    n_updates            | 490       |
|    policy_gradient_loss | -0.0451   |
|    value_loss           | 3.2       |
----------------------------

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 197.79885057471265
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -229         |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 58           |
|    time_elapsed         | 1151         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0061711054 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.59        |
|    explained_variance   | 0.32         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.36         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0382      |
|    value_

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 205.3800505050505
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -182         |
| time/                   |              |
|    fps                  | 51           |
|    iterations           | 66           |
|    time_elapsed         | 1304         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0062148916 |
|    clip_fraction        | 0.113        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.43        |
|    explained_variance   | 0.318        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.59         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0356      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 211.78716216216216
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -137        |
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 74          |
|    time_elapsed         | 1456        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.005151497 |
|    clip_fraction        | 0.085       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.16       |
|    explained_variance   | 0.386       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.06        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0279     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 217.16260162601625
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -95.7       |
| time/                   |             |
|    fps                  | 52          |
|    iterations           | 82          |
|    time_elapsed         | 1599        |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.004572831 |
|    clip_fraction        | 0.0783      |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.13       |
|    explained_variance   | 0.507       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 810         |
|    policy_gradient_loss | -0.0282     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 221.62962962962962
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -64.7      |
| time/                   |            |
|    fps                  | 52         |
|    iterations           | 90         |
|    time_elapsed         | 1741       |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00630557 |
|    clip_fraction        | 0.111      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.01      |
|    explained_variance   | 0.494      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.27       |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.0342    |
|    value_loss           | 3.39       |
--------

-------- Rollout Summary --------
Total mean reward: 170.0
Standard deviation of reward: 0.0
Average successful assignments: 225.5297619047619
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -42.8        |
| time/                   |              |
|    fps                  | 53           |
|    iterations           | 98           |
|    time_elapsed         | 1880         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0064094784 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.96        |
|    explained_variance   | 0.519        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.91         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0372      |
|    value_l

In [11]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -386.0
Standard deviation of reward: 0.0
Average successful assignments: 8.083333333333334
All assignments history: [13, 14, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -373     |
| time/              |          |
|    fps             | 64       |
|    iterations      | 1        |
|    time_elapsed    | 15       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 8.958333333333334
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -372         |
| time/                   |              |
|    fps                  | 61           |
|    iterations           | 2            |
|    time_elapsed         | 33           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0077928966 |
|    clip_fraction        | 0.0876       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.87        |
|    explained_variance   | -0.204       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.95         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0404      |
|    value_

-------- Rollout Summary --------
Total mean reward: 8.0
Standard deviation of reward: 0.0
Average successful assignments: 93.475
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 10          |
|    time_elapsed         | 171         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010416912 |
|    clip_fraction        | 0.184       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.8        |
|    explained_variance   | -0.0025     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.17        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0419     |
|    value_loss           | 3.98        |
--

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 134.12962962962962
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 18          |
|    time_elapsed         | 315         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010803979 |
|    clip_fraction        | 0.225       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.71       |
|    explained_variance   | 0.165       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.161       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0468     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 88.0
Standard deviation of reward: 0.0
Average successful assignments: 158.64743589743588
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -346       |
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 26         |
|    time_elapsed         | 454        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00926126 |
|    clip_fraction        | 0.185      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.51      |
|    explained_variance   | 0.351      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.66       |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0472    |
|    value_loss           | 3.2        |
---------

-------- Rollout Summary --------
Total mean reward: 98.0
Standard deviation of reward: 0.0
Average successful assignments: 172.27941176470588
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -328        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 34          |
|    time_elapsed         | 594         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.009633814 |
|    clip_fraction        | 0.208       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.05       |
|    explained_variance   | 0.455       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.725       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0536     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 112.0
Standard deviation of reward: 0.0
Average successful assignments: 182.9920634920635
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -301       |
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 42         |
|    time_elapsed         | 731        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00825507 |
|    clip_fraction        | 0.153      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.43      |
|    explained_variance   | 0.648      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.28       |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.046     |
|    value_loss           | 3.26       |
---------

-------- Rollout Summary --------
Total mean reward: 124.0
Standard deviation of reward: 0.0
Average successful assignments: 191.94833333333332
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -273         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 50           |
|    time_elapsed         | 872          |
|    total_timesteps      | 51200        |
| train/                  |              |
|    approx_kl            | 0.0072848536 |
|    clip_fraction        | 0.112        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.92        |
|    explained_variance   | 0.693        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.15         |
|    n_updates            | 490          |
|    policy_gradient_loss | -0.0388      |
|    value_

-------- Rollout Summary --------
Total mean reward: 144.0
Standard deviation of reward: 0.0
Average successful assignments: 200.132183908046
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -226        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 58          |
|    time_elapsed         | 1013        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006927549 |
|    clip_fraction        | 0.133       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.47       |
|    explained_variance   | 0.73        |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0421     |
|    value_loss           | 3.18

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 207.3409090909091
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -175        |
| time/                   |             |
|    fps                  | 58          |
|    iterations           | 66          |
|    time_elapsed         | 1155        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008272402 |
|    clip_fraction        | 0.164       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.25       |
|    explained_variance   | 0.576       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0432     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 213.3536036036036
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -125         |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 74           |
|    time_elapsed         | 1298         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0074330387 |
|    clip_fraction        | 0.148        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.12        |
|    explained_variance   | 0.555        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.07         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0385      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 218.3770325203252
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -81.8      |
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 82         |
|    time_elapsed         | 1436       |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00763347 |
|    clip_fraction        | 0.167      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.14      |
|    explained_variance   | 0.463      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.21       |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0433    |
|    value_loss           | 3.02       |
---------

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 222.55277777777778
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -49.1        |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 90           |
|    time_elapsed         | 1574         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0068429955 |
|    clip_fraction        | 0.121        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.14        |
|    explained_variance   | 0.49         |
|    learning_rate        | 0.00018      |
|    loss                 | 0.945        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0339      |
|    value_

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 226.05102040816325
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -27.3      |
| time/                   |            |
|    fps                  | 58         |
|    iterations           | 98         |
|    time_elapsed         | 1726       |
|    total_timesteps      | 100352     |
| train/                  |            |
|    approx_kl            | 0.00652683 |
|    clip_fraction        | 0.134      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.1       |
|    explained_variance   | 0.399      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.3        |
|    n_updates            | 970        |
|    policy_gradient_loss | -0.0393    |
|    value_loss           | 3.23       |
--------

In [12]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -374.0
Standard deviation of reward: 0.0
Average successful assignments: 13.833333333333334
All assignments history: [16, 20, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -364     |
| time/              |          |
|    fps             | 62       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -224.0
Standard deviation of reward: 0.0
Average successful assignments: 44.958333333333336
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 2           |
|    time_elapsed         | 33          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007979611 |
|    clip_fraction        | 0.0825      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.87       |
|    explained_variance   | -0.186      |
|    learning_rate        | 0.00018     |
|    loss                 | 1.45        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0398     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: 4.0
Standard deviation of reward: 0.0
Average successful assignments: 104.85
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -365        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 10          |
|    time_elapsed         | 183         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.009867515 |
|    clip_fraction        | 0.189       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.8        |
|    explained_variance   | 0.00706     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.618       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0407     |
|    value_loss           | 3.77        |
--

-------- Rollout Summary --------
Total mean reward: 14.0
Standard deviation of reward: 0.0
Average successful assignments: 136.33333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 18          |
|    time_elapsed         | 334         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.008618778 |
|    clip_fraction        | 0.167       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.7        |
|    explained_variance   | 0.136       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.802       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0407     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 151.91666666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -347         |
| time/                   |              |
|    fps                  | 55           |
|    iterations           | 26           |
|    time_elapsed         | 482          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0074148215 |
|    clip_fraction        | 0.147        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.42        |
|    explained_variance   | 0.325        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.414        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0419      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 96.0
Standard deviation of reward: 0.0
Average successful assignments: 166.10049019607843
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -326        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 34          |
|    time_elapsed         | 634         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008590197 |
|    clip_fraction        | 0.164       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.87       |
|    explained_variance   | 0.407       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.57        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0447     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 178.90674603174602
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -298        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 42          |
|    time_elapsed         | 785         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008186616 |
|    clip_fraction        | 0.17        |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.18       |
|    explained_variance   | 0.572       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0469     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 189.05333333333334
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -267        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 50          |
|    time_elapsed         | 933         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006592503 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.73       |
|    explained_variance   | 0.619       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.038      |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 197.78735632183907
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -220        |
| time/                   |             |
|    fps                  | 55          |
|    iterations           | 58          |
|    time_elapsed         | 1078        |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007981771 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.43       |
|    explained_variance   | 0.524       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.41        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.045      |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 204.8560606060606
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -170         |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 66           |
|    time_elapsed         | 1229         |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0063205007 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.31        |
|    explained_variance   | 0.537        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.09         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.0329      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 210.69932432432432
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -122        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 74          |
|    time_elapsed         | 1380        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006318055 |
|    clip_fraction        | 0.128       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.15       |
|    explained_variance   | 0.527       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.26        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0378     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 215.58028455284554
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -78.5        |
| time/                   |              |
|    fps                  | 54           |
|    iterations           | 82           |
|    time_elapsed         | 1532         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0062717516 |
|    clip_fraction        | 0.112        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.03        |
|    explained_variance   | 0.465        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.18         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0304      |
|    value_

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 219.5962962962963
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -48.5       |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 90          |
|    time_elapsed         | 1685        |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.006215049 |
|    clip_fraction        | 0.125       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.06       |
|    explained_variance   | 0.435       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.694       |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0365     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 222.98469387755102
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -31.3       |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 98          |
|    time_elapsed         | 1828        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.006304918 |
|    clip_fraction        | 0.126       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.08       |
|    explained_variance   | 0.426       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.72        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0373     |
|    value_loss           | 3.

In [13]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -366.0
Standard deviation of reward: 0.0
Average successful assignments: 16.5
All assignments history: [15, 13, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -372     |
| time/              |          |
|    fps             | 63       |
|    iterations      | 1        |
|    time_elapsed    | 16       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -274.0
Standard deviation of reward: 0.0
Average successful assignments: 35.875
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -370        |
| time/                   |             |
|    fps                  | 61          |
|    iterations           | 2           |
|    time_elapsed         | 33          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007068443 |
|    clip_fraction        | 0.0622      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.87       |
|    explained_variance   | -0.211      |
|    learning_rate        | 0.00018     |
|    loss                 | 4.12        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.04       |
|    value_loss           | 19.5        |

-------- Rollout Summary --------
Total mean reward: 12.0
Standard deviation of reward: 0.0
Average successful assignments: 124.575
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -366         |
| time/                   |              |
|    fps                  | 57           |
|    iterations           | 10           |
|    time_elapsed         | 176          |
|    total_timesteps      | 10240        |
| train/                  |              |
|    approx_kl            | 0.0090427175 |
|    clip_fraction        | 0.173        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.8         |
|    explained_variance   | -0.0107      |
|    learning_rate        | 0.00018      |
|    loss                 | 0.941        |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.0401      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 48.0
Standard deviation of reward: 0.0
Average successful assignments: 152.74074074074073
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -359        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 18          |
|    time_elapsed         | 322         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.007955778 |
|    clip_fraction        | 0.137       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.74       |
|    explained_variance   | 0.169       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.29        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0402     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 62.0
Standard deviation of reward: 0.0
Average successful assignments: 166.28205128205127
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -347        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 26          |
|    time_elapsed         | 470         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.007487299 |
|    clip_fraction        | 0.13        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.5        |
|    explained_variance   | 0.327       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.733       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0403     |
|    value_loss           | 2.8

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 175.94117647058823
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -329        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 34          |
|    time_elapsed         | 617         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.008906178 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.01       |
|    explained_variance   | 0.279       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.927       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0486     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 185.1547619047619
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -303        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 42          |
|    time_elapsed         | 764         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008935835 |
|    clip_fraction        | 0.179       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.41       |
|    explained_variance   | 0.302       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.11        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0505     |
|    value_loss           | 3.1

-------- Rollout Summary --------
Total mean reward: 134.0
Standard deviation of reward: 0.0
Average successful assignments: 194.68166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -275        |
| time/                   |             |
|    fps                  | 56          |
|    iterations           | 50          |
|    time_elapsed         | 909         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.008115007 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.89       |
|    explained_variance   | 0.378       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.21        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0467     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 203.11063218390805
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -227         |
| time/                   |              |
|    fps                  | 56           |
|    iterations           | 58           |
|    time_elapsed         | 1048         |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0075318543 |
|    clip_fraction        | 0.14         |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.51        |
|    explained_variance   | 0.283        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.23         |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0416      |
|    value_

-------- Rollout Summary --------
Total mean reward: 168.0
Standard deviation of reward: 0.0
Average successful assignments: 210.47222222222223
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -177        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 66          |
|    time_elapsed         | 1177        |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006602818 |
|    clip_fraction        | 0.117       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.38       |
|    explained_variance   | 0.351       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.09        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0385     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 168.0
Standard deviation of reward: 0.0
Average successful assignments: 216.6036036036036
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -130        |
| time/                   |             |
|    fps                  | 57          |
|    iterations           | 74          |
|    time_elapsed         | 1308        |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006292311 |
|    clip_fraction        | 0.116       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.24       |
|    explained_variance   | 0.338       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0355     |
|    value_loss           | 2.9

-------- Rollout Summary --------
Total mean reward: 168.0
Standard deviation of reward: 0.0
Average successful assignments: 221.6290650406504
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -87.9        |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 82           |
|    time_elapsed         | 1438         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0074111563 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.27        |
|    explained_variance   | 0.411        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.11         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0359      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 168.0
Standard deviation of reward: 0.0
Average successful assignments: 225.66944444444445
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -57.8        |
| time/                   |              |
|    fps                  | 58           |
|    iterations           | 90           |
|    time_elapsed         | 1567         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0076918416 |
|    clip_fraction        | 0.157        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.27        |
|    explained_variance   | 0.395        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.29         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0445      |
|    value_

-------- Rollout Summary --------
Total mean reward: 170.0
Standard deviation of reward: 0.0
Average successful assignments: 229.33928571428572
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -36.6       |
| time/                   |             |
|    fps                  | 59          |
|    iterations           | 98          |
|    time_elapsed         | 1684        |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.007053787 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.07       |
|    explained_variance   | 0.531       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.04        |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.04       |
|    value_loss           | 2.

In [14]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -370.0
Standard deviation of reward: 0.0
Average successful assignments: 14.916666666666666
All assignments history: [19, 10, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -371     |
| time/              |          |
|    fps             | 81       |
|    iterations      | 1        |
|    time_elapsed    | 12       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -356.0
Standard deviation of reward: 0.0
Average successful assignments: 17.541666666666668
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -374       |
| time/                   |            |
|    fps                  | 78         |
|    iterations           | 2          |
|    time_elapsed         | 26         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00943508 |
|    clip_fraction        | 0.117      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.87      |
|    explained_variance   | -0.205     |
|    learning_rate        | 0.00018    |
|    loss                 | 2.56       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0447    |
|    value_loss           | 16.3       |
-------

-------- Rollout Summary --------
Total mean reward: -24.0
Standard deviation of reward: 0.0
Average successful assignments: 71.41666666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -364        |
| time/                   |             |
|    fps                  | 74          |
|    iterations           | 10          |
|    time_elapsed         | 137         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.011409399 |
|    clip_fraction        | 0.249       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.79       |
|    explained_variance   | -0.00218    |
|    learning_rate        | 0.00018     |
|    loss                 | 0.477       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0468     |
|    value_loss           | 3.6

-------- Rollout Summary --------
Total mean reward: 36.0
Standard deviation of reward: 0.0
Average successful assignments: 117.56481481481481
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -355       |
| time/                   |            |
|    fps                  | 73         |
|    iterations           | 18         |
|    time_elapsed         | 249        |
|    total_timesteps      | 18432      |
| train/                  |            |
|    approx_kl            | 0.00964279 |
|    clip_fraction        | 0.203      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.69      |
|    explained_variance   | 0.101      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.11       |
|    n_updates            | 170        |
|    policy_gradient_loss | -0.044     |
|    value_loss           | 2.9        |
---------

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 140.44551282051282
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -342        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 26          |
|    time_elapsed         | 362         |
|    total_timesteps      | 26624       |
| train/                  |             |
|    approx_kl            | 0.008610394 |
|    clip_fraction        | 0.156       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.4        |
|    explained_variance   | 0.367       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.571       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0428     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 56.0
Standard deviation of reward: 0.0
Average successful assignments: 154.6642156862745
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -319        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 34          |
|    time_elapsed         | 473         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007959422 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.83       |
|    explained_variance   | 0.632       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.832       |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0419     |
|    value_loss           | 3.38

-------- Rollout Summary --------
Total mean reward: 74.0
Standard deviation of reward: 0.0
Average successful assignments: 166.06944444444446
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -293        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 42          |
|    time_elapsed         | 588         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.007310168 |
|    clip_fraction        | 0.106       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.39       |
|    explained_variance   | 0.728       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.35        |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0366     |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 96.0
Standard deviation of reward: 0.0
Average successful assignments: 175.39333333333335
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -269        |
| time/                   |             |
|    fps                  | 73          |
|    iterations           | 50          |
|    time_elapsed         | 698         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.006759896 |
|    clip_fraction        | 0.131       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.93       |
|    explained_variance   | 0.721       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.34        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0427     |
|    value_loss           | 3.4

-------- Rollout Summary --------
Total mean reward: 118.0
Standard deviation of reward: 0.0
Average successful assignments: 184.19396551724137
All assignments history: []
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 400       |
|    ep_rew_mean          | -226      |
| time/                   |           |
|    fps                  | 71        |
|    iterations           | 58        |
|    time_elapsed         | 829       |
|    total_timesteps      | 59392     |
| train/                  |           |
|    approx_kl            | 0.0069672 |
|    clip_fraction        | 0.115     |
|    clip_range           | 0.15      |
|    entropy_loss         | -3.62     |
|    explained_variance   | 0.6       |
|    learning_rate        | 0.00018   |
|    loss                 | 1.36      |
|    n_updates            | 570       |
|    policy_gradient_loss | -0.0382   |
|    value_loss           | 3.46      |
----------------------------

-------- Rollout Summary --------
Total mean reward: 126.0
Standard deviation of reward: 0.0
Average successful assignments: 191.70580808080808
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -181        |
| time/                   |             |
|    fps                  | 72          |
|    iterations           | 66          |
|    time_elapsed         | 936         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.006440198 |
|    clip_fraction        | 0.125       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.48       |
|    explained_variance   | 0.537       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.41        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0393     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 197.86261261261262
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -138         |
| time/                   |              |
|    fps                  | 71           |
|    iterations           | 74           |
|    time_elapsed         | 1055         |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0070578773 |
|    clip_fraction        | 0.133        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.33        |
|    explained_variance   | 0.52         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.21         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0402      |
|    value_

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 203.4481707317073
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -101         |
| time/                   |              |
|    fps                  | 71           |
|    iterations           | 82           |
|    time_elapsed         | 1168         |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0071921526 |
|    clip_fraction        | 0.141        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.23        |
|    explained_variance   | 0.405        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.26         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0402      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 208.52314814814815
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -72          |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 90           |
|    time_elapsed         | 1272         |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0069522597 |
|    clip_fraction        | 0.128        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.05        |
|    explained_variance   | 0.346        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.44         |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0363      |
|    value_

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 212.94727891156464
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -49.6        |
| time/                   |              |
|    fps                  | 72           |
|    iterations           | 98           |
|    time_elapsed         | 1376         |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0070138713 |
|    clip_fraction        | 0.115        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.93        |
|    explained_variance   | 0.418        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.2          |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0363      |
|    value_

In [15]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -216.0
Standard deviation of reward: 0.0
Average successful assignments: 79.83333333333333
All assignments history: [18, 20, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -362     |
| time/              |          |
|    fps             | 89       |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -348.0
Standard deviation of reward: 0.0
Average successful assignments: 51.791666666666664
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -368        |
| time/                   |             |
|    fps                  | 82          |
|    iterations           | 2           |
|    time_elapsed         | 24          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007593554 |
|    clip_fraction        | 0.078       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.87       |
|    explained_variance   | -0.269      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.88        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0416     |
|    value_loss           | 1

-------- Rollout Summary --------
Total mean reward: -92.0
Standard deviation of reward: 0.0
Average successful assignments: 69.05
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -366        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 10          |
|    time_elapsed         | 126         |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010382418 |
|    clip_fraction        | 0.21        |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.81       |
|    explained_variance   | 9.34e-05    |
|    learning_rate        | 0.00018     |
|    loss                 | 0.644       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0424     |
|    value_loss           | 4.01        |
-

-------- Rollout Summary --------
Total mean reward: 86.0
Standard deviation of reward: 0.0
Average successful assignments: 114.92129629629629
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -360        |
| time/                   |             |
|    fps                  | 78          |
|    iterations           | 18          |
|    time_elapsed         | 235         |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.008807583 |
|    clip_fraction        | 0.155       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.72       |
|    explained_variance   | 0.202       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.298       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0377     |
|    value_loss           | 2.6

-------- Rollout Summary --------
Total mean reward: 102.0
Standard deviation of reward: 0.0
Average successful assignments: 144.28205128205127
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -347         |
| time/                   |              |
|    fps                  | 78           |
|    iterations           | 26           |
|    time_elapsed         | 338          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0083454475 |
|    clip_fraction        | 0.16         |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.51        |
|    explained_variance   | 0.367        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.6          |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0431      |
|    value_

-------- Rollout Summary --------
Total mean reward: 82.0
Standard deviation of reward: 0.0
Average successful assignments: 160.19607843137254
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -331         |
| time/                   |              |
|    fps                  | 79           |
|    iterations           | 34           |
|    time_elapsed         | 440          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0077815405 |
|    clip_fraction        | 0.134        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.04        |
|    explained_variance   | 0.464        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.911        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0433      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 136.0
Standard deviation of reward: 0.0
Average successful assignments: 173.57142857142858
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -308        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 42          |
|    time_elapsed         | 542         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008661952 |
|    clip_fraction        | 0.168       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.46       |
|    explained_variance   | 0.523       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0452     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 185.40166666666667
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -280        |
| time/                   |             |
|    fps                  | 79          |
|    iterations           | 50          |
|    time_elapsed         | 643         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007006927 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.88       |
|    explained_variance   | 0.624       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.13        |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0411     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 152.0
Standard deviation of reward: 0.0
Average successful assignments: 194.98994252873564
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -233        |
| time/                   |             |
|    fps                  | 81          |
|    iterations           | 58          |
|    time_elapsed         | 731         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.007293985 |
|    clip_fraction        | 0.129       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.42       |
|    explained_variance   | 0.584       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.56        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0385     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 150.0
Standard deviation of reward: 0.0
Average successful assignments: 202.75757575757575
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -183        |
| time/                   |             |
|    fps                  | 83          |
|    iterations           | 66          |
|    time_elapsed         | 808         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.007252267 |
|    clip_fraction        | 0.132       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.22       |
|    explained_variance   | 0.521       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.18        |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0349     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 152.0
Standard deviation of reward: 0.0
Average successful assignments: 209.0608108108108
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -135         |
| time/                   |              |
|    fps                  | 87           |
|    iterations           | 74           |
|    time_elapsed         | 864          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0069297105 |
|    clip_fraction        | 0.147        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.11        |
|    explained_variance   | 0.396        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.68         |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0409      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 214.25
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -91          |
| time/                   |              |
|    fps                  | 92           |
|    iterations           | 82           |
|    time_elapsed         | 911          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0060339617 |
|    clip_fraction        | 0.127        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.12        |
|    explained_variance   | 0.375        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.32         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0363      |
|    value_loss        

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 218.68148148148148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -56         |
| time/                   |             |
|    fps                  | 97          |
|    iterations           | 90          |
|    time_elapsed         | 948         |
|    total_timesteps      | 92160       |
| train/                  |             |
|    approx_kl            | 0.007020833 |
|    clip_fraction        | 0.127       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.09       |
|    explained_variance   | 0.361       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.51        |
|    n_updates            | 890         |
|    policy_gradient_loss | -0.0339     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 222.54421768707482
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -34.4        |
| time/                   |              |
|    fps                  | 102          |
|    iterations           | 98           |
|    time_elapsed         | 982          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0061486987 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.11        |
|    explained_variance   | 0.342        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.62         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0375      |
|    value_

In [16]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -382.0
Standard deviation of reward: 0.0
Average successful assignments: 9.416666666666666
All assignments history: [9, 14, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -377     |
| time/              |          |
|    fps             | 251      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -334.0
Standard deviation of reward: 0.0
Average successful assignments: 19.958333333333332
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -370         |
| time/                   |              |
|    fps                  | 246          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0072409725 |
|    clip_fraction        | 0.0615       |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.87        |
|    explained_variance   | -0.231       |
|    learning_rate        | 0.00018      |
|    loss                 | 2.64         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0403      |
|    value

-------- Rollout Summary --------
Total mean reward: -54.0
Standard deviation of reward: 0.0
Average successful assignments: 96.01666666666667
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -365       |
| time/                   |            |
|    fps                  | 239        |
|    iterations           | 10         |
|    time_elapsed         | 42         |
|    total_timesteps      | 10240      |
| train/                  |            |
|    approx_kl            | 0.01068825 |
|    clip_fraction        | 0.201      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.81      |
|    explained_variance   | 0.00974    |
|    learning_rate        | 0.00018    |
|    loss                 | 2.07       |
|    n_updates            | 90         |
|    policy_gradient_loss | -0.0443    |
|    value_loss           | 4.13       |
---------

-------- Rollout Summary --------
Total mean reward: -28.0
Standard deviation of reward: 0.0
Average successful assignments: 123.63888888888889
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 18          |
|    time_elapsed         | 77          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009713806 |
|    clip_fraction        | 0.191       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.72       |
|    explained_variance   | 0.152       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.32        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0428     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 26.0
Standard deviation of reward: 0.0
Average successful assignments: 138.50320512820514
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -344         |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 26           |
|    time_elapsed         | 111          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0073947757 |
|    clip_fraction        | 0.126        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.48        |
|    explained_variance   | 0.392        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.08         |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0424      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 52.0
Standard deviation of reward: 0.0
Average successful assignments: 151.44607843137254
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -326         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 34           |
|    time_elapsed         | 146          |
|    total_timesteps      | 34816        |
| train/                  |              |
|    approx_kl            | 0.0073706694 |
|    clip_fraction        | 0.123        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.97        |
|    explained_variance   | 0.446        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.717        |
|    n_updates            | 330          |
|    policy_gradient_loss | -0.0412      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 88.0
Standard deviation of reward: 0.0
Average successful assignments: 163.609126984127
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -299         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 42           |
|    time_elapsed         | 180          |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0072148745 |
|    clip_fraction        | 0.116        |
|    clip_range           | 0.15         |
|    entropy_loss         | -4.32        |
|    explained_variance   | 0.493        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.1          |
|    n_updates            | 410          |
|    policy_gradient_loss | -0.0392      |
|    value_los

-------- Rollout Summary --------
Total mean reward: 140.0
Standard deviation of reward: 0.0
Average successful assignments: 176.39666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -271        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 50          |
|    time_elapsed         | 214         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007376573 |
|    clip_fraction        | 0.151       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.87       |
|    explained_variance   | 0.461       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.2         |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0451     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 152.0
Standard deviation of reward: 0.0
Average successful assignments: 187.11063218390805
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -225        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 58          |
|    time_elapsed         | 249         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.006810301 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.58       |
|    explained_variance   | 0.304       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.28        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0444     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 195.91666666666666
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -176         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 66           |
|    time_elapsed         | 284          |
|    total_timesteps      | 67584        |
| train/                  |              |
|    approx_kl            | 0.0067873895 |
|    clip_fraction        | 0.108        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.36        |
|    explained_variance   | 0.443        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.25         |
|    n_updates            | 650          |
|    policy_gradient_loss | -0.035       |
|    value_

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 203.46734234234233
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -127        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 74          |
|    time_elapsed         | 318         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.006972522 |
|    clip_fraction        | 0.136       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.18       |
|    explained_variance   | 0.515       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.13        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0395     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 164.0
Standard deviation of reward: 0.0
Average successful assignments: 209.7022357723577
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -83.9        |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 82           |
|    time_elapsed         | 352          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0059659677 |
|    clip_fraction        | 0.106        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.17        |
|    explained_variance   | 0.55         |
|    learning_rate        | 0.00018      |
|    loss                 | 1.13         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0341      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 166.0
Standard deviation of reward: 0.0
Average successful assignments: 214.96481481481482
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -51.4      |
| time/                   |            |
|    fps                  | 238        |
|    iterations           | 90         |
|    time_elapsed         | 386        |
|    total_timesteps      | 92160      |
| train/                  |            |
|    approx_kl            | 0.00578739 |
|    clip_fraction        | 0.113      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.06      |
|    explained_variance   | 0.506      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.03       |
|    n_updates            | 890        |
|    policy_gradient_loss | -0.033     |
|    value_loss           | 3.31       |
--------

-------- Rollout Summary --------
Total mean reward: 170.0
Standard deviation of reward: 0.0
Average successful assignments: 219.4872448979592
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -29.5        |
| time/                   |              |
|    fps                  | 238          |
|    iterations           | 98           |
|    time_elapsed         | 421          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0065725287 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.03        |
|    explained_variance   | 0.577        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.39         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0375      |
|    value_l

In [17]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -384.0
Standard deviation of reward: 0.0
Average successful assignments: 8.166666666666666
All assignments history: [12, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -382     |
| time/              |          |
|    fps             | 255      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -286.0
Standard deviation of reward: 0.0
Average successful assignments: 29.375
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -372       |
| time/                   |            |
|    fps                  | 245        |
|    iterations           | 2          |
|    time_elapsed         | 8          |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.00726746 |
|    clip_fraction        | 0.0642     |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.87      |
|    explained_variance   | -0.105     |
|    learning_rate        | 0.00018    |
|    loss                 | 3.97       |
|    n_updates            | 10         |
|    policy_gradient_loss | -0.0377    |
|    value_loss           | 20.3       |
-------------------

-------- Rollout Summary --------
Total mean reward: -44.0
Standard deviation of reward: 0.0
Average successful assignments: 77.95
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 239         |
|    iterations           | 10          |
|    time_elapsed         | 42          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010195531 |
|    clip_fraction        | 0.202       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.8        |
|    explained_variance   | 0.00907     |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0435     |
|    value_loss           | 4.34        |
-

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 118.24537037037037
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -358        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 18          |
|    time_elapsed         | 77          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.009425357 |
|    clip_fraction        | 0.169       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.71       |
|    explained_variance   | 0.201       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.67        |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0426     |
|    value_loss           | 2.7

-------- Rollout Summary --------
Total mean reward: 40.0
Standard deviation of reward: 0.0
Average successful assignments: 140.91346153846155
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -346       |
| time/                   |            |
|    fps                  | 237        |
|    iterations           | 26         |
|    time_elapsed         | 111        |
|    total_timesteps      | 26624      |
| train/                  |            |
|    approx_kl            | 0.00883133 |
|    clip_fraction        | 0.148      |
|    clip_range           | 0.15       |
|    entropy_loss         | -5.5       |
|    explained_variance   | 0.292      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.801      |
|    n_updates            | 250        |
|    policy_gradient_loss | -0.0449    |
|    value_loss           | 3.12       |
---------

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 154.95098039215685
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -328        |
| time/                   |             |
|    fps                  | 238         |
|    iterations           | 34          |
|    time_elapsed         | 145         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007497576 |
|    clip_fraction        | 0.119       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.05       |
|    explained_variance   | 0.482       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.45        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.042      |
|    value_loss           | 3.2

-------- Rollout Summary --------
Total mean reward: 110.0
Standard deviation of reward: 0.0
Average successful assignments: 168.0595238095238
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -305       |
| time/                   |            |
|    fps                  | 238        |
|    iterations           | 42         |
|    time_elapsed         | 180        |
|    total_timesteps      | 43008      |
| train/                  |            |
|    approx_kl            | 0.00796662 |
|    clip_fraction        | 0.146      |
|    clip_range           | 0.15       |
|    entropy_loss         | -4.48      |
|    explained_variance   | 0.589      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.43       |
|    n_updates            | 410        |
|    policy_gradient_loss | -0.0445    |
|    value_loss           | 3.12       |
---------

-------- Rollout Summary --------
Total mean reward: 132.0
Standard deviation of reward: 0.0
Average successful assignments: 179.54833333333335
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -277        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 50          |
|    time_elapsed         | 215         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007728738 |
|    clip_fraction        | 0.166       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.89       |
|    explained_variance   | 0.707       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.949       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0466     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 138.0
Standard deviation of reward: 0.0
Average successful assignments: 189.257183908046
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -232        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 58          |
|    time_elapsed         | 249         |
|    total_timesteps      | 59392       |
| train/                  |             |
|    approx_kl            | 0.008244036 |
|    clip_fraction        | 0.172       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.65       |
|    explained_variance   | 0.599       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.02        |
|    n_updates            | 570         |
|    policy_gradient_loss | -0.0456     |
|    value_loss           | 2.86

-------- Rollout Summary --------
Total mean reward: 148.0
Standard deviation of reward: 0.0
Average successful assignments: 197.30429292929293
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -184        |
| time/                   |             |
|    fps                  | 237         |
|    iterations           | 66          |
|    time_elapsed         | 284         |
|    total_timesteps      | 67584       |
| train/                  |             |
|    approx_kl            | 0.008023874 |
|    clip_fraction        | 0.153       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.41       |
|    explained_variance   | 0.611       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.986       |
|    n_updates            | 650         |
|    policy_gradient_loss | -0.0409     |
|    value_loss           | 2.

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 204.07545045045046
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -138         |
| time/                   |              |
|    fps                  | 237          |
|    iterations           | 74           |
|    time_elapsed         | 318          |
|    total_timesteps      | 75776        |
| train/                  |              |
|    approx_kl            | 0.0061365613 |
|    clip_fraction        | 0.111        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.26        |
|    explained_variance   | 0.587        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.883        |
|    n_updates            | 730          |
|    policy_gradient_loss | -0.0377      |
|    value_

-------- Rollout Summary --------
Total mean reward: 154.0
Standard deviation of reward: 0.0
Average successful assignments: 209.7591463414634
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -95.8      |
| time/                   |            |
|    fps                  | 237        |
|    iterations           | 82         |
|    time_elapsed         | 353        |
|    total_timesteps      | 83968      |
| train/                  |            |
|    approx_kl            | 0.00707572 |
|    clip_fraction        | 0.122      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.26      |
|    explained_variance   | 0.466      |
|    learning_rate        | 0.00018    |
|    loss                 | 0.964      |
|    n_updates            | 810        |
|    policy_gradient_loss | -0.0368    |
|    value_loss           | 2.72       |
---------

-------- Rollout Summary --------
Total mean reward: 156.0
Standard deviation of reward: 0.0
Average successful assignments: 214.5388888888889
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -63.5        |
| time/                   |              |
|    fps                  | 236          |
|    iterations           | 90           |
|    time_elapsed         | 390          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0057142414 |
|    clip_fraction        | 0.117        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.25        |
|    explained_variance   | 0.507        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.921        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0361      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 218.59948979591837
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -43         |
| time/                   |             |
|    fps                  | 235         |
|    iterations           | 98          |
|    time_elapsed         | 425         |
|    total_timesteps      | 100352      |
| train/                  |             |
|    approx_kl            | 0.007841796 |
|    clip_fraction        | 0.151       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.25       |
|    explained_variance   | 0.463       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.4         |
|    n_updates            | 970         |
|    policy_gradient_loss | -0.0405     |
|    value_loss           | 3.

In [18]:
import pandas as pd
import numpy as np
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback

# Load datasets
tasks_df = pd.read_csv('RandomTasks400.csv')


# Rename columns for consistency
tasks_df.rename(columns={
    'Required_RAM': 'RAM',
    'Required_Storage': 'storage',
    'Minimum_Trust_Factor': 'Trustfactor',
    'Max_Distance': 'Distance',
    'Min_Transmission_Rate': 'TransmissionRate',
    'Min_Eligibility': 'MinEligibility'
}, inplace=True)

# Define the Gym environment for task allocation
class TaskAllocationEnv(gym.Env):
    metadata = {'render.modes': ['human']}

    def __init__(self, vehicles, tasks):
        super(TaskAllocationEnv, self).__init__()
        self.vehicles = vehicles
        self.tasks = tasks
        self.current_task = 0
        self.successful_assignments = 0
        self.successful_history = []  # Added to track successful assignments
        self.seed()

        num_features = tasks.shape[1]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(num_features,), dtype=np.float32)
        self.eligible_vehicle_indices = []
        self.update_action_space()

    def seed(self, seed=42):
        self.np_random, seed = gym.utils.seeding.np_random(seed)
        return [seed]        
        
    def update_action_space(self):
        task_eligibility = self.tasks.iloc[self.current_task]['MinEligibility']
        eligible_vehicles = self.vehicles[self.vehicles['Eligible'] >= task_eligibility]
        self.eligible_vehicle_indices = eligible_vehicles.index.tolist()
        if len(self.eligible_vehicle_indices) == 0:
            self.action_space = spaces.Discrete(1)  # Prevents invalid action space of size 0
        else:
            self.action_space = spaces.Discrete(len(self.eligible_vehicle_indices))

    def reset(self):
        self.current_task = 0
        self.successful_assignments = 0
        self.update_action_space()
        #print("Resetting environment. Starting new episode.")
        return self.tasks.iloc[self.current_task].values.astype(np.float32)

    def step(self, action):
        task = self.tasks.iloc[self.current_task]
        vehicle = self.vehicles.iloc[action]
        reward = 0

        # Check if the vehicle meets all the task requirements
        meets_requirements = (
            vehicle['RAM'] >= task['RAM'] and
            vehicle['storage'] >= task['storage'] and
            vehicle['Trustfactor'] >= task['Trustfactor'] and
            vehicle['Distance'] <= task['Distance'] and
            vehicle['TransmissionRate'] >= task['TransmissionRate']
        )
        reward = 1 if meets_requirements else -1

        self.successful_assignments += reward > 0
        self.current_task += 1
        done = self.current_task >= len(self.tasks)

        if not done:
            self.update_action_space()
            next_state = self.tasks.iloc[self.current_task].values.astype(np.float32)
        else:
            next_state = np.zeros(self.observation_space.shape[0])
            self.successful_history.append(self.successful_assignments)
            #print(f"Episode completed. Successful assignments: {self.successful_assignments}.")
            self.successful_assignments = 0  # Reset for next episode

        
        
        #three lines below can be uncommented for more detailed output
        #print(f"Task Details: {task.to_dict()}")
        #print(f"Vehicle Details: {vehicle.to_dict()}")
        #print(f"Step: Task {self.current_task}, Action {action}, Reward {reward}")

        return next_state, reward, done, {}

    def get_average_success(self):
        return np.mean(self.successful_history) if self.successful_history else 0


    def render(self, mode='human'):
        pass

    def close(self):
        pass



# Custom callback for logging
class CustomCallback(BaseCallback):
    def __init__(self, env, verbose=0):
        super(CustomCallback, self).__init__(verbose)
        self.env = env
        self.total_rewards = 0
        self.total_assignments = 0
        self.num_episodes = 0

    def _on_step(self):
        return True

    def _on_rollout_end(self):
        mean_reward, std_reward = evaluate_policy(self.model, self.model.get_env(), n_eval_episodes=10)
        average_assignments = self.env.get_attr('get_average_success')[0]()
        self.total_rewards += mean_reward
        self.total_assignments += average_assignments
        self.num_episodes += 1

        print("-------- Rollout Summary --------")
        print(f"Total mean reward: {mean_reward}")
        print(f"Standard deviation of reward: {std_reward}")
        print(f"Average successful assignments: {average_assignments}")
        print("All assignments history:", self.env.envs[0].successful_history)
        self.env.envs[0].successful_history = []  # Reset history after each iteration

    def _on_training_end(self):
        average_total_reward = self.total_rewards / self.num_episodes
        average_total_assignments = self.total_assignments / self.num_episodes
        print("-------- Training Summary --------")
        print(f"Overall Average Mean Reward: {average_total_reward}")
        print(f"Overall Average Successful Assignments: {average_total_assignments}")


# Prepare the environment
env = make_vec_env(lambda: TaskAllocationEnv(vehicles_df, tasks_df), n_envs=1)

# Initialize and train the PPO model
model = PPO("MlpPolicy", env, verbose=1,
            n_steps=1024, batch_size=128, n_epochs=10, learning_rate=0.00018,
            gamma=0.96, gae_lambda=0.87, clip_range=0.15, ent_coef=0.07)

callback = CustomCallback(env)  # Use custom callback for detailed tracking and logging

# Train the model with the custom callback
model.learn(total_timesteps=1024*100, callback=callback)

# Save the model
model.save("ppo_task_allocation_model")


Using cpu device




-------- Rollout Summary --------
Total mean reward: -370.0
Standard deviation of reward: 0.0
Average successful assignments: 15.0
All assignments history: [18, 12, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 400      |
|    ep_rew_mean     | -370     |
| time/              |          |
|    fps             | 245      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 1024     |
---------------------------------


  logger.warn(
  logger.warn(


-------- Rollout Summary --------
Total mean reward: -318.0
Standard deviation of reward: 0.0
Average successful assignments: 25.625
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -372        |
| time/                   |             |
|    fps                  | 241         |
|    iterations           | 2           |
|    time_elapsed         | 8           |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.007885432 |
|    clip_fraction        | 0.0967      |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.87       |
|    explained_variance   | -0.247      |
|    learning_rate        | 0.00018     |
|    loss                 | 2.43        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0404     |
|    value_loss           | 17.2        |

-------- Rollout Summary --------
Total mean reward: -8.0
Standard deviation of reward: 0.0
Average successful assignments: 102.075
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -367        |
| time/                   |             |
|    fps                  | 236         |
|    iterations           | 10          |
|    time_elapsed         | 43          |
|    total_timesteps      | 10240       |
| train/                  |             |
|    approx_kl            | 0.010379191 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.8        |
|    explained_variance   | 0.00504     |
|    learning_rate        | 0.00018     |
|    loss                 | 0.435       |
|    n_updates            | 90          |
|    policy_gradient_loss | -0.0419     |
|    value_loss           | 4.01        |


-------- Rollout Summary --------
Total mean reward: 46.0
Standard deviation of reward: 0.0
Average successful assignments: 137.8148148148148
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -357        |
| time/                   |             |
|    fps                  | 234         |
|    iterations           | 18          |
|    time_elapsed         | 78          |
|    total_timesteps      | 18432       |
| train/                  |             |
|    approx_kl            | 0.010905644 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.15        |
|    entropy_loss         | -5.69       |
|    explained_variance   | 0.236       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.695       |
|    n_updates            | 170         |
|    policy_gradient_loss | -0.0461     |
|    value_loss           | 2.65

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 156.86858974358975
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -343         |
| time/                   |              |
|    fps                  | 234          |
|    iterations           | 26           |
|    time_elapsed         | 113          |
|    total_timesteps      | 26624        |
| train/                  |              |
|    approx_kl            | 0.0071554002 |
|    clip_fraction        | 0.122        |
|    clip_range           | 0.15         |
|    entropy_loss         | -5.37        |
|    explained_variance   | 0.405        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.693        |
|    n_updates            | 250          |
|    policy_gradient_loss | -0.0382      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 64.0
Standard deviation of reward: 0.0
Average successful assignments: 168.9950980392157
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -318        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 34          |
|    time_elapsed         | 148         |
|    total_timesteps      | 34816       |
| train/                  |             |
|    approx_kl            | 0.007936343 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.75       |
|    explained_variance   | 0.661       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.27        |
|    n_updates            | 330         |
|    policy_gradient_loss | -0.0437     |
|    value_loss           | 2.89

-------- Rollout Summary --------
Total mean reward: 94.0
Standard deviation of reward: 0.0
Average successful assignments: 178.7718253968254
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -289        |
| time/                   |             |
|    fps                  | 233         |
|    iterations           | 42          |
|    time_elapsed         | 184         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.008980411 |
|    clip_fraction        | 0.158       |
|    clip_range           | 0.15        |
|    entropy_loss         | -4.21       |
|    explained_variance   | 0.658       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.1         |
|    n_updates            | 410         |
|    policy_gradient_loss | -0.0425     |
|    value_loss           | 2.75

-------- Rollout Summary --------
Total mean reward: 120.0
Standard deviation of reward: 0.0
Average successful assignments: 187.92666666666668
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -261        |
| time/                   |             |
|    fps                  | 232         |
|    iterations           | 50          |
|    time_elapsed         | 219         |
|    total_timesteps      | 51200       |
| train/                  |             |
|    approx_kl            | 0.007902365 |
|    clip_fraction        | 0.149       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.76       |
|    explained_variance   | 0.633       |
|    learning_rate        | 0.00018     |
|    loss                 | 0.967       |
|    n_updates            | 490         |
|    policy_gradient_loss | -0.0432     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 122.0
Standard deviation of reward: 0.0
Average successful assignments: 195.71120689655172
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -213         |
| time/                   |              |
|    fps                  | 232          |
|    iterations           | 58           |
|    time_elapsed         | 255          |
|    total_timesteps      | 59392        |
| train/                  |              |
|    approx_kl            | 0.0065330206 |
|    clip_fraction        | 0.122        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.49        |
|    explained_variance   | 0.649        |
|    learning_rate        | 0.00018      |
|    loss                 | 1            |
|    n_updates            | 570          |
|    policy_gradient_loss | -0.0403      |
|    value_

-------- Rollout Summary --------
Total mean reward: 146.0
Standard deviation of reward: 0.0
Average successful assignments: 202.70833333333334
All assignments history: []
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 400        |
|    ep_rew_mean          | -163       |
| time/                   |            |
|    fps                  | 232        |
|    iterations           | 66         |
|    time_elapsed         | 290        |
|    total_timesteps      | 67584      |
| train/                  |            |
|    approx_kl            | 0.00836736 |
|    clip_fraction        | 0.147      |
|    clip_range           | 0.15       |
|    entropy_loss         | -3.28      |
|    explained_variance   | 0.529      |
|    learning_rate        | 0.00018    |
|    loss                 | 1.05       |
|    n_updates            | 650        |
|    policy_gradient_loss | -0.0407    |
|    value_loss           | 2.87       |
--------

-------- Rollout Summary --------
Total mean reward: 152.0
Standard deviation of reward: 0.0
Average successful assignments: 208.91103603603602
All assignments history: []
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 400         |
|    ep_rew_mean          | -116        |
| time/                   |             |
|    fps                  | 232         |
|    iterations           | 74          |
|    time_elapsed         | 326         |
|    total_timesteps      | 75776       |
| train/                  |             |
|    approx_kl            | 0.007032457 |
|    clip_fraction        | 0.131       |
|    clip_range           | 0.15        |
|    entropy_loss         | -3.27       |
|    explained_variance   | 0.386       |
|    learning_rate        | 0.00018     |
|    loss                 | 1.23        |
|    n_updates            | 730         |
|    policy_gradient_loss | -0.0366     |
|    value_loss           | 3.

-------- Rollout Summary --------
Total mean reward: 160.0
Standard deviation of reward: 0.0
Average successful assignments: 214.1991869918699
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -78.8        |
| time/                   |              |
|    fps                  | 232          |
|    iterations           | 82           |
|    time_elapsed         | 361          |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0067794286 |
|    clip_fraction        | 0.139        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.11        |
|    explained_variance   | 0.344        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.08         |
|    n_updates            | 810          |
|    policy_gradient_loss | -0.0435      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 162.0
Standard deviation of reward: 0.0
Average successful assignments: 218.7962962962963
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -50.3        |
| time/                   |              |
|    fps                  | 234          |
|    iterations           | 90           |
|    time_elapsed         | 393          |
|    total_timesteps      | 92160        |
| train/                  |              |
|    approx_kl            | 0.0060931873 |
|    clip_fraction        | 0.132        |
|    clip_range           | 0.15         |
|    entropy_loss         | -3.03        |
|    explained_variance   | 0.348        |
|    learning_rate        | 0.00018      |
|    loss                 | 0.939        |
|    n_updates            | 890          |
|    policy_gradient_loss | -0.0386      |
|    value_l

-------- Rollout Summary --------
Total mean reward: 158.0
Standard deviation of reward: 0.0
Average successful assignments: 222.718537414966
All assignments history: []
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 400          |
|    ep_rew_mean          | -28.8        |
| time/                   |              |
|    fps                  | 235          |
|    iterations           | 98           |
|    time_elapsed         | 425          |
|    total_timesteps      | 100352       |
| train/                  |              |
|    approx_kl            | 0.0068844287 |
|    clip_fraction        | 0.129        |
|    clip_range           | 0.15         |
|    entropy_loss         | -2.95        |
|    explained_variance   | 0.439        |
|    learning_rate        | 0.00018      |
|    loss                 | 1.19         |
|    n_updates            | 970          |
|    policy_gradient_loss | -0.0351      |
|    value_lo