In [1]:
pip install numpy gym torch stable-baselines3


Collecting stable-baselines3
  Downloading stable_baselines3-2.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (

In [8]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gym import spaces
from stable_baselines3 import PPO
# Import DummyVecEnv from the correct location
from stable_baselines3.common.vec_env import DummyVecEnv
from collections import deque
import random

# Custom FANET Environment
class FANETRoutingEnv(gym.Env):
    """Toy routing environment over a set of UAVs.

    The observation is a ``(num_uavs, 3)`` float32 array with values in
    ``[0, 1]``; each row holds ``[link quality, energy level, queue size]``
    for one UAV. The action is the index of the UAV chosen as next hop.
    UAV states are redrawn uniformly at random on every reset and step, and
    an episode ends after ``max_steps`` steps.

    Uses the classic gym API: ``reset()`` returns the observation and
    ``step()`` returns ``(obs, reward, done, info)``.
    """

    def __init__(self, num_uavs=5):
        """Create the environment.

        Args:
            num_uavs: Number of UAVs, i.e. number of rows in the observation
                and number of discrete actions.
        """
        super(FANETRoutingEnv, self).__init__()

        # Environment parameters
        self.num_uavs = num_uavs
        self.max_steps = 100
        self.current_step = 0

        # Action & State Space
        self.action_space = spaces.Discrete(self.num_uavs)  # Choose next UAV for routing
        self.observation_space = spaces.Box(low=0, high=1, shape=(self.num_uavs, 3), dtype=np.float32)

        # UAV State [Link Quality, Energy Level, Queue Size]
        self.state = self._sample_state()

    def _sample_state(self):
        """Draw a fresh random UAV state matching ``observation_space``.

        ``np.random.rand`` yields float64; the cast keeps the observation
        dtype consistent with the float32 declared on the Box space.
        """
        return np.random.rand(self.num_uavs, 3).astype(np.float32)

    def reset(self):
        """Start a new episode and return the initial (unflattened) state."""
        self.current_step = 0
        self.state = self._sample_state()  # Reinitialize UAV states
        return self.state

    def step(self, action):
        """Route through the selected UAV and advance the simulation one step.

        Args:
            action: Index of the chosen UAV. A Python int, a NumPy integer or
                a single-element array (as returned by ``predict``) is
                accepted.

        Returns:
            ``(state, reward, done, info)`` where ``state`` is the new
            ``(num_uavs, 3)`` float32 array, ``reward`` is a Python float,
            ``done`` is True once ``max_steps`` steps have been taken and
            ``info`` is an empty dict.

        Raises:
            IndexError: If the action is outside ``[0, num_uavs)``.
        """
        # Normalise array-like actions to a plain int so indexing always
        # selects exactly one row (a length-1 array would otherwise select a
        # 2-D slice and produce a vector reward).
        uav = int(np.asarray(action).item())
        if not 0 <= uav < self.num_uavs:
            raise IndexError(
                f"action {uav} is out of range for {self.num_uavs} UAVs"
            )

        self.current_step += 1

        # Reward function: Encourage good link quality & energy efficiency
        link_quality, energy, queue_size = (float(v) for v in self.state[uav])
        reward = (link_quality * 10) - (queue_size * 2) - ((1 - energy) * 5)

        # Update state: Simulate dynamic changes in FANET
        self.state = self._sample_state()
        done = self.current_step >= self.max_steps

        # Return the state without flattening
        return self.state, reward, done, {}

    def render(self, mode='human'):
        """Rendering is not implemented; this is a no-op."""
        pass

In [12]:
class DQN(nn.Module):
    """Three-layer MLP mapping a per-UAV feature matrix to one Q-value per action.

    Args:
        input_dim: Number of rows in one state (the number of UAVs).
        output_dim: Number of actions, i.e. size of the Q-value vector.
        num_features: Number of features per row. Defaults to 3, which is the
            width of the state produced by the routing environment.
    """

    def __init__(self, input_dim, output_dim, num_features=3):
        super(DQN, self).__init__()
        # The state is flattened in forward(), so the first layer sees
        # input_dim * num_features values per sample.
        self.fc1 = nn.Linear(input_dim * num_features, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        """Return Q-values of shape ``(batch, output_dim)``.

        Args:
            x: Batched states; everything after the first (batch) dimension
                is flattened before the first linear layer.
        """
        # reshape (unlike view) also accepts non-contiguous tensors.
        x = x.reshape(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

class DQNAgent:
    """Epsilon-greedy DQN agent with experience replay and a target network.

    Args:
        input_dim: Forwarded to ``DQN`` as its ``input_dim``.
        output_dim: Number of discrete actions.
        target_update_every: Number of gradient steps between copies of the
            online network's weights into the target network.
    """

    def __init__(self, input_dim, output_dim, target_update_every=100):
        self.model = DQN(input_dim, output_dim)
        self.target_model = DQN(input_dim, output_dim)
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
        self.memory = deque(maxlen=1000)
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

        self.num_actions = output_dim
        self.target_update_every = target_update_every
        self.train_steps = 0  # gradient steps taken so far

        # Start with identical online/target weights; otherwise the TD
        # targets come from an unrelated randomly-initialised network.
        self.update_target()
        self.target_model.eval()

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def select_action(self, state):
        """Choose an action with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the highest predicted Q-value.

        Args:
            state: A single (unbatched) state; a batch dimension is added
                before it is passed to the network.

        Returns:
            The selected action index as an int.
        """
        if np.random.rand() < self.epsilon:
            return int(np.random.randint(0, self.num_actions))
        state = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)
        with torch.no_grad():  # inference only, no graph needed
            return torch.argmax(self.model(state)).item()

    def train(self, batch_size=32):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        experiences. After each gradient step, ``epsilon`` is multiplied by
        ``epsilon_decay`` (never dropping below ``epsilon_min``) and, every
        ``target_update_every`` steps, the target network is refreshed.
        """
        if len(self.memory) < batch_size:
            return
        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Stack into single ndarrays first: building a tensor from a tuple of
        # ndarrays is slow and triggers a PyTorch warning.
        states = torch.as_tensor(np.asarray(states, dtype=np.float32))
        actions = torch.as_tensor(np.asarray(actions, dtype=np.int64))
        rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32))
        next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32))

        q_values = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)
        # The TD target is treated as a constant: no gradient through it.
        with torch.no_grad():
            next_q_values = self.target_model(next_states).max(1)[0]
            target_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        loss = nn.MSELoss()(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.train_steps += 1
        if self.train_steps % self.target_update_every == 0:
            self.update_target()

        # Anneal exploration; without this the policy stays fully random.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def store_experience(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition
        to the replay buffer (oldest entries are dropped once it is full)."""
        self.memory.append((state, action, reward, next_state, done))


In [13]:
!pip install shimmy>=2.0


In [14]:
def _make_fanet_env():
    """Factory handed to DummyVecEnv; builds one 5-UAV routing environment."""
    return FANETRoutingEnv(num_uavs=5)


# DummyVecEnv takes a list of zero-argument factories, one per sub-environment.
env = DummyVecEnv([_make_fanet_env])
ppo_model = PPO("MlpPolicy", env, verbose=1)

# Train the PPO policy for a fixed budget of environment steps.
print("Training PPO Model...")
ppo_model.learn(total_timesteps=5000)
print("PPO Training Completed.")




Using cpu device
Training PPO Model...
-----------------------------
| time/              |      |
|    fps             | 301  |
|    iterations      | 1    |
|    time_elapsed    | 6    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 281          |
|    iterations           | 2            |
|    time_elapsed         | 14           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0064895884 |
|    clip_fraction        | 0.0193       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.61        |
|    explained_variance   | 0.00236      |
|    learning_rate        | 0.0003       |
|    loss                 | 102          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.013       |
|    value_loss           | 310          |
--------------------

In [15]:
# Initialize Environment & DQN Agent
env = FANETRoutingEnv(num_uavs=5)
dqn_agent = DQNAgent(input_dim=env.observation_space.shape[0], output_dim=env.action_space.n)

num_episodes = 100

# DQN training: one replay-buffer insert and one training call per env step.
for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        action = dqn_agent.select_action(state)  # DQN selects best routing link
        next_state, reward, done, _ = env.step(action)

        dqn_agent.store_experience(state, action, reward, next_state, done)
        dqn_agent.train()
        state = next_state
        total_reward += reward

    print(f"Episode {episode+1}, Total Reward: {total_reward}")

# PPO Deployment
# Relies on `ppo_model` trained in the PPO training cell above.
PPO_DEPLOY_STEPS = 10

print("Deploying PPO for UAV Path Optimization...")
ppo_obs = env.reset()
ppo_total_reward = 0.0
for _step in range(PPO_DEPLOY_STEPS):
    # deterministic=True: act on the policy's best action instead of sampling.
    action, _hidden = ppo_model.predict(ppo_obs, deterministic=True)
    # predict returns a NumPy array; the env expects a single UAV index.
    ppo_obs, ppo_reward, ppo_done, _info = env.step(int(action))
    ppo_total_reward += ppo_reward
    if ppo_done:
        # Never step a finished episode; start a new one instead.
        ppo_obs = env.reset()

print(f"PPO deployment total reward over {PPO_DEPLOY_STEPS} steps: {ppo_total_reward}")


Episode 1, Total Reward: 101.09856717679705
Episode 2, Total Reward: 185.96752300012176
Episode 3, Total Reward: 131.7517013427119
Episode 4, Total Reward: 154.69471538676382
Episode 5, Total Reward: 113.30963082406109
Episode 6, Total Reward: 189.04222462108527
Episode 7, Total Reward: 99.84993036926137
Episode 8, Total Reward: 134.12978487585576
Episode 9, Total Reward: 182.90377795875116
Episode 10, Total Reward: 159.6269745114348
Episode 11, Total Reward: 131.2391268291912
Episode 12, Total Reward: 167.7906883083219
Episode 13, Total Reward: 160.53533137801497
Episode 14, Total Reward: 178.46345729285454
Episode 15, Total Reward: 174.60055440529612
Episode 16, Total Reward: 176.84971725964974
Episode 17, Total Reward: 153.28844253250435
Episode 18, Total Reward: 181.1483172303795
Episode 19, Total Reward: 142.71605183653773
Episode 20, Total Reward: 129.61557026660157
Episode 21, Total Reward: 150.1934992662397
Episode 22, Total Reward: 156.92635000745275
Episode 23, Total Reward: 