<a href="https://colab.research.google.com/github/alirezakavianifar/gitTutorial/blob/developer/Copy_of_RLProject3_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q gymnasium stable_baselines3

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.logger import configure
import numpy as np
import os

class HealthcareNetworkEnv(gym.Env):
    def __init__(self, H, P, R, T, LeadTime, transport_costs, transshipment_costs, inventory_costs, ordering_costs, coverage_distance, hospital_distances):
        super(HealthcareNetworkEnv, self).__init__()

        self.H = H
        self.P = P
        self.R = R
        self.T = T
        self.LeadTime = LeadTime

        self.transport_costs = transport_costs
        self.transshipment_costs = transshipment_costs
        self.inventory_costs = inventory_costs
        self.ordering_costs = ordering_costs

        self.coverage_distance = coverage_distance
        self.hospital_distances = hospital_distances

        self.observation_space = spaces.Dict({
            'inventory': spaces.Box(low=0, high=np.inf, shape=(H, P), dtype=np.float32),
            'demand': spaces.Box(low=0, high=np.inf, shape=(H, P), dtype=np.float32),
            'supply_capacity': spaces.Box(low=0, high=np.inf, shape=(H, P), dtype=np.float32),
            'lead_time': spaces.Box(low=0, high=np.inf, shape=(P,), dtype=np.float32)
        })

        self.action_space = spaces.MultiDiscrete([10] * (H * P * R + H * H * P))
        self.state = self.reset()

    def reset(self, **kwargs):
        self.state = {
            'inventory': np.zeros((self.H, self.P), dtype=np.float32),
            'demand': np.random.randint(0, 10, size=(self.H, self.P)).astype(np.float32),
            'supply_capacity': np.ones((self.H, self.P), dtype=np.float32),
            'lead_time': self.LeadTime.astype(np.float32)
        }
        self.orders_in_transit = []
        self.current_time = 0
        return self.state, {}

    def step(self, action):
        order_action_size = self.H * self.P * self.R
        order = np.array(action[:order_action_size]).reshape((self.H, self.P, self.R))
        transship = np.array(action[order_action_size:]).reshape((self.H, self.H, self.P))

        self._update_inventory(order, transship)
        reward, demand_loss, costs = self._calculate_reward(order, transship)
        self._update_demand()
        next_state = self.state

        done = self.current_time >= self.T
        truncated = False  # We are not using truncation, so set it to False
        self.current_time += 1

        return next_state, reward, done, truncated, {}

    def _update_inventory(self, order, transship):
        new_orders_in_transit = []
        for order_info in self.orders_in_transit:
            arrival_time, h, p, quantity = order_info
            if self.current_time >= arrival_time:
                self.state['inventory'][h, p] += quantity
            else:
                new_orders_in_transit.append(order_info)
        self.orders_in_transit = new_orders_in_transit

        for h in range(self.H):
            for p in range(self.P):
                for r in range(self.R):
                    supply_received = order[h, p, r]
                    capacity_available = self.state['supply_capacity'][h, p]
                    supply_received = min(supply_received, capacity_available)
                    arrival_time = self.current_time + self.LeadTime[p]
                    self.orders_in_transit.append((arrival_time, h, p, supply_received))

        for h1 in range(self.H):
            for h2 in range(self.H):
                if h1 != h2 and self.hospital_distances[h1, h2] <= self.coverage_distance:
                    for p in range(self.P):
                        if transship[h1, h2, p] > 0:
                            transfer_quantity = min(transship[h1, h2, p], self.state['inventory'][h1, p])
                            self.state['inventory'][h1, p] -= transfer_quantity
                            self.state['inventory'][h2, p] += transfer_quantity

        self.state['inventory'] = np.maximum(self.state['inventory'], 0)

    def _calculate_reward(self, order, transship):
        reward = 0
        demand_loss = 0
        total_costs = 0

        epsilon_p = 0.01

        transport_cost = 0
        transshipment_cost = 0
        inventory_cost = 0
        ordering_cost = 0
        shortage_cost = 0

        for h in range(self.H):
            for p in range(self.P):
                for r in range(self.R):
                    ordered_quantity = order[h, p, r]
                    received_quantity = self.state['inventory'][h, p]

                    if received_quantity < ordered_quantity * (1 + epsilon_p):
                        demand_loss += (ordered_quantity * (1 + epsilon_p) - received_quantity)
                    elif received_quantity > ordered_quantity * (1 + epsilon_p):
                        demand_loss += (received_quantity - ordered_quantity * (1 + epsilon_p))
                    else:
                        reward += received_quantity

                    transport_cost += ordered_quantity * self.transport_costs[r, h, p]

                inventory_cost += self.state['inventory'][h, p] * self.inventory_costs[h, p]
                shortage = max(self.state['demand'][h, p] - self.state['inventory'][h, p], 0)
                shortage_cost += shortage

        for h1 in range(self.H):
            for h2 in range(self.H):
                if h1 != h2 and self.hospital_distances[h1, h2] <= self.coverage_distance:
                    for p in range(self.P):
                        transshipment_quantity = transship[h1, h2, p]
                        transshipment_cost += transshipment_quantity * self.transshipment_costs[h1, h2, p]

        for h in range(self.H):
            for p in range(self.P):
                ordering_cost += np.sum(order[h, p, :]) * self.ordering_costs[p, h]

        total_costs = transport_cost + transshipment_cost + inventory_cost + ordering_cost + shortage_cost
        reward = -total_costs

        return reward, demand_loss, total_costs

    def _update_demand(self):
        self.state['demand'] = np.random.randint(0, 10, size=(self.H, self.P)).astype(np.float32)

# Define parameters
H = 5
P = 3
R = 1
T = 10
LeadTime = np.array([1, 2, 3])

transport_costs = np.random.rand(R, H, P)
transshipment_costs = np.random.rand(H, H, P)
inventory_costs = np.random.rand(H, P)
ordering_costs = np.random.rand(P, H)
coverage_distance = 5.0
hospital_distances = np.random.rand(H, H) * 10

env = HealthcareNetworkEnv(H, P, R, T, LeadTime, transport_costs, transshipment_costs, inventory_costs, ordering_costs, coverage_distance, hospital_distances)

check_env(env)




In [None]:
env = DummyVecEnv([lambda: Monitor(env)])

log_dir = "./logs/"
os.makedirs(log_dir, exist_ok=True)
logger = configure(log_dir, ["stdout", "csv", "tensorboard"])

eval_env = DummyVecEnv([lambda: Monitor(HealthcareNetworkEnv(H, P, R, T, LeadTime, transport_costs, transshipment_costs, inventory_costs, ordering_costs, coverage_distance, hospital_distances))])
eval_callback = EvalCallback(eval_env, best_model_save_path="./logs/best_model", log_path="./logs/results", eval_freq=500, deterministic=True, render=False)

model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir)
model.learn(total_timesteps=10000, callback=eval_callback)

model.save("ppo_healthcare_network")

# To load the trained model and evaluate
model = PPO.load("ppo_healthcare_network")

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, rewards, dones, truncated, info = env.step(action)
    if dones:
        obs = env.reset()

print("Evaluation completed.")

Logging to ./logs/
Using cuda device
Logging to ./logs/PPO_1
Eval num_timesteps=500, episode_reward=-2160.43 +/- 49.38
Episode length: 11.00 +/- 0.00
----------------------------------
| eval/              |           |
|    mean_ep_length  | 11        |
|    mean_reward     | -2.16e+03 |
| time/              |           |
|    total_timesteps | 500       |
----------------------------------
New best mean reward!
Eval num_timesteps=1000, episode_reward=-2169.02 +/- 18.51
Episode length: 11.00 +/- 0.00
----------------------------------
| eval/              |           |
|    mean_ep_length  | 11        |
|    mean_reward     | -2.17e+03 |
| time/              |           |
|    total_timesteps | 1000      |
----------------------------------
Eval num_timesteps=1500, episode_reward=-2158.79 +/- 41.85
Episode length: 11.00 +/- 0.00
----------------------------------
| eval/              |           |
|    mean_ep_length  | 11        |
|    mean_reward     | -2.16e+03 |
| time/           