In [1]:
#multi-agent testrun using pettingzoo
#still need to update the actionset
#still need to update reward function
#trial to see if it works

#update the actions so that it selects the frequency range of a particular radar first, and then picks a sub-band from inside it
#fix the frequency hopping of the radar so that it only transmits one frequency per radar per step.

#after finalizing actions and rewards, test on vanilla DQN and start implementing DDQN with PER (prioritized experience replay)

#fixed the radar frequency issue; still need to fix the (states, obs, info, etc.) tuple returned by the env to the DQN
#after that, train dqn
#finally move to ddqn with per

#created the ddqn with per, need to add multiple radars and finalize
#right now it's trying to lock all jammers onto a single radar, which doesn't make sense and is the reason for no convergence

#multiple jammers added, need to max scale rewards between 0 and 1
#get loss curves, success rate of jammers for any target, success rate of blocking each target
#assign threat coefficients based on how far away the transmitted frequency is from the jammer's frequency
#add frequency diversity

In [2]:
import numpy as np
import math
import gymnasium as gym
from gym import Env
from gym import spaces
import random
import numpy as np
from IPython.display import clear_output
import os
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from pettingzoo import ParallelEnv

# True when matplotlib reports a backend whose name contains 'inline'
# (the notebook backend); only then is IPython's display module imported.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Switch matplotlib to interactive mode.
plt.ion()

#device setup
# Preference order: CUDA, then the MPS backend, then the CPU.
# NOTE(review): nothing visible in this notebook moves a model or tensor onto
# `device`, so training appears to run on the CPU regardless — confirm.
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

In [3]:
# [band start, band stop] of each frequency-agile radar.
freq_agile_radar_versions = [
    [9 * math.pow(10, 9), 9.4 * math.pow(10, 9)],
    [8 * math.pow(10, 9), 8.5 * math.pow(10, 9)],
    [6 * math.pow(10, 9), 6.2 * math.pow(10, 9)],
]

# For every radar, divide its band into 10 equal steps and keep the integer
# frequency at the start of each step (the band stop itself is not included).
hop_states = [
    [int(band_start + ((band_stop - band_start) / 10) * step) for step in range(10)]
    for band_start, band_stop in freq_agile_radar_versions
]

print(hop_states)


[[9000000000, 9040000000, 9080000000, 9120000000, 9160000000, 9200000000, 9240000000, 9280000000, 9320000000, 9360000000], [8000000000, 8050000000, 8100000000, 8150000000, 8200000000, 8250000000, 8300000000, 8350000000, 8400000000, 8450000000], [6000000000, 6020000000, 6040000000, 6060000000, 6080000000, 6100000000, 6120000000, 6140000000, 6160000000, 6180000000]]


In [4]:
#hop_states = np.linspace(9e9, 9.4e9, 20e6)
# Jamming bandwidths an agent can choose from. The environment adds/subtracts
# the chosen value to/from the jammer's centre frequency, so it must be in the
# same units as the hop-state frequencies.
jamming_bandwidths = [10e6, 20e6, 30e6]

In [5]:
class ParallelRadarJammingEnv(ParallelEnv):
    """Multi-agent radar-jamming environment built on PettingZoo's ``ParallelEnv``.

    Every radar hops through its own list of frequencies. On each step every
    jammer agent submits a discrete action that decodes to a
    ``(centre frequency, bandwidth)`` pair; the jammer succeeds when the
    current frequency of *any* radar lies within ``centre ± bandwidth``.

    Args:
        hop_states: one list of hop frequencies per radar. ``low`` and
            ``interval`` are derived from the first two entries of each list,
            so every list needs at least two entries.
        jamming_bandwidths: bandwidths an agent can select (same units as the
            hop frequencies).
        max_hop_length: length of the random hopping pattern drawn per radar
            on ``reset``. Patterns are sampled without replacement, so this
            must not exceed the number of frequencies of any radar.
        max_steps: number of steps after which an agent is flagged as
            terminated (default 200, the previously hard-coded value).

    NOTE(review): ``reset`` returns only the observation dict and ``step``
    returns a 4-tuple ``(observations, rewards, done, info)``. The callers in
    this notebook rely on exactly that shape, so it is kept as is.
    """

    # Reward shaping constants (values unchanged from the original code).
    JAM_REWARD_UNIT = 30      # multiplied by (position in hop list + 1) on a hit
    MISS_PENALTY = -50        # reward when no radar frequency is covered
    WIDE_BAND_PENALTY = 1     # subtracted when the jamming band is very wide
    TEAM_BONUS = 5            # added/subtracted depending on the running jam count

    def __init__(self, hop_states, jamming_bandwidths, max_hop_length=10, max_steps=200):
        super().__init__()

        self.hop_states = hop_states
        self.max_hop_length = max_hop_length
        self.max_steps = max_steps

        #params for each hop state list
        self.low = [hop[0] for hop in self.hop_states]
        self.interval = [hop[1] - hop[0] for hop in self.hop_states]
        self.n_frequencies = [len(hop) for hop in self.hop_states]

        #agents
        self.agents = ["jammer_0", "jammer_1", "jammer_2"]
        self.possible_agents = self.agents[:]
        self.agent_name_mapping = {agent: i for i, agent in enumerate(self.agents)}

        #action space
        self.jamming_bandwidths = jamming_bandwidths
        self.n_bandwidths = len(self.jamming_bandwidths)
        # Kept as a dict attribute because existing callers index it
        # (``env.action_space[agent]``). ``action_spaces`` is the same dict
        # under the attribute name PettingZoo uses.
        # NOTE(review): with max_hop_length * n_bandwidths actions shared by
        # all radars, only the first few positions of each hop list can be
        # selected as a centre frequency (see ``_decode_action``) — confirm
        # this coverage is intended.
        self.action_space = {agent: spaces.Discrete(self.max_hop_length * self.n_bandwidths) for agent in self.agents}
        self.action_spaces = self.action_space

        #observation space
        self.observation_spaces = {
            agent: spaces.Box(low=0, high=1, shape=(sum(self.n_frequencies),), dtype=np.float32) for agent in self.agents
        }

        # Episode state; populated by reset().
        self.hopping_patterns = None
        self.radar_frequencies = None
        self.current_frequencies = None  # unused by this class; kept for compatibility
        self.current_steps = None
        self.cumulative_rewards = None
        self.net_agent_rewards = None
        self.terminations = None
        self.rewards = None
        self.info = None

    def observation_space(self, agent):
        """Return the observation space of ``agent``."""
        return self.observation_spaces[agent]

    def _decode_action(self, index):
        """Split an action index into ``(hop_index, position, bandwidth_index)``.

        ``hop_index`` selects the radar whose band is targeted, ``position`` is
        the multiple of that radar's hop interval added to its lowest
        frequency, and ``bandwidth_index`` selects the jamming bandwidth.
        """
        frequency_index, bandwidth_index = divmod(index, self.n_bandwidths)
        position, hop_index = divmod(frequency_index, len(self.hop_states))
        return hop_index, position, bandwidth_index

    def index_to_action(self, index):
        """Decode an action index into ``(frequency, bandwidth)``."""
        #action index to freq + bandwidth
        hop_index, position, bandwidth_index = self._decode_action(index)
        frequency = int(self.low[hop_index] + position * self.interval[hop_index])
        bandwidth = self.jamming_bandwidths[bandwidth_index]
        return frequency, bandwidth

    def generate_hopping_pattern(self):
        """Draw one random hopping pattern (no repeated frequency) per radar."""
        return [np.random.choice(hop, self.max_hop_length, replace=False) for hop in self.hop_states]

    def reset(self):
        """Start a new episode and return the initial observation of every agent."""
        self.hopping_patterns = self.generate_hopping_pattern()
        self.radar_frequencies = [pattern[0] for pattern in self.hopping_patterns]

        self.current_steps = {agent: 0 for agent in self.agents}
        self.cumulative_rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.rewards = {agent: 0 for agent in self.agents}
        self.info = {agent: {} for agent in self.agents}

        observations = {agent: self.observe(agent) for agent in self.agents}
        return observations

    def observe(self, agent):
        """Return the observation vector for ``agent``.

        The vector is the concatenation of one one-hot segment per radar that
        marks the radar's current frequency inside its hop list. ``agent`` is
        not used: every agent receives the same observation.
        """
        # float32 to match the dtype declared in ``observation_spaces``.
        observation = np.zeros(sum(self.n_frequencies), dtype=np.float32)
        offset = 0
        for i, frequencies in enumerate(self.hop_states):
            freq_index = np.where(np.array(frequencies) == self.radar_frequencies[i])[0][0]
            observation[offset + freq_index] = 1
            offset += len(frequencies)
        return observation

    def step(self, actions):
        """Apply one action per agent and advance every radar to its next hop.

        Args:
            actions: mapping ``agent -> action index``.

        Returns:
            ``(observations, rewards, done, info)`` — four dicts keyed by agent.
            Agents missing from ``actions`` keep their previous reward entry.
        """
        total_jammed_frequencies = 0
        self.net_agent_rewards = 0
        jam_threshold = len(self.hop_states) / len(self.agents)

        for agent, action in actions.items():
            _, position, _ = self._decode_action(action)
            frequency, bandwidth = self.index_to_action(action)

            lower_bound = frequency - bandwidth
            upper_bound = frequency + bandwidth

            jammed = any(
                lower_bound <= radar_frequency <= upper_bound
                for radar_frequency in self.radar_frequencies
            )

            if jammed:
                # The reward scales with the position of the jammer's centre
                # frequency inside its hop list. The decoded position is used
                # directly instead of searching every hop list for the
                # frequency, which raised IndexError whenever the frequency was
                # not an exact member of a list.
                # NOTE(review): the original comment described the factor as
                # "the element of hop_states the frequency is found in", which
                # reads like the radar index rather than the in-band position —
                # confirm which one is intended.
                reward = self.JAM_REWARD_UNIT * (position + 1)
                total_jammed_frequencies += 1
            else:
                reward = self.MISS_PENALTY

            # Small penalty when the jamming band is wide relative to the hop
            # interval of the radar currently closest to the jammer's centre.
            nearest_radar = np.argmin([abs(frequency - radar) for radar in self.radar_frequencies])
            if 2 * (bandwidth / self.interval[nearest_radar]) > 5:
                reward -= self.WIDE_BAND_PENALTY

            # NOTE(review): the jam count is still being accumulated while this
            # loop runs, so the bonus depends on the order in which agents are
            # processed. Behaviour is kept unchanged here.
            if total_jammed_frequencies > jam_threshold:
                reward += self.TEAM_BONUS
            else:
                reward -= self.TEAM_BONUS

            # Update rewards and steps
            self.current_steps[agent] += 1
            self.cumulative_rewards[agent] += reward
            self.net_agent_rewards += reward

            # Terminate the agent once it has taken ``max_steps`` steps
            if self.current_steps[agent] >= self.max_steps:
                self.terminations[agent] = True

            self.rewards[agent] = reward

        # Update radar frequencies for the next step. The hop position is
        # derived from the largest per-agent step counter instead of the loop
        # variable left over from the loop above, which was undefined when
        # ``actions`` was empty.
        hop_step = max(self.current_steps.values()) % self.max_hop_length
        self.radar_frequencies = [pattern[hop_step] for pattern in self.hopping_patterns]

        # Return observations, rewards, done status, and info for each agent
        observations = {agent: self.observe(agent) for agent in self.agents}
        rewards = {agent: self.rewards[agent] for agent in self.agents}
        done = {agent: self.terminations[agent] for agent in self.agents}
        info = {agent: self.info[agent] for agent in self.agents}

        return observations, rewards, done, info

    def render(self):
        """Print the step counter of every agent and the current radar frequencies."""
        for agent in self.agents:
            print(f"Agent {agent}, Step: {self.current_steps[agent]}, Radar Frequencies: {self.radar_frequencies}")


In [6]:
# Smoke test: play one full episode with uniformly random actions and print
# the per-step rewards of every jammer.
env = ParallelRadarJammingEnv(hop_states, jamming_bandwidths)

observations = env.reset()
done = dict.fromkeys(env.agents, False)
cumulative_reward = dict.fromkeys(env.agents, 0)

while not all(done.values()):

    # one random action per jammer
    actions = {}
    for agent in env.agents:
        actions[agent] = env.action_space[agent].sample()

    observations, rewards, done, infos = env.step(actions)

    # running reward total per jammer
    cumulative_reward = {
        agent: cumulative_reward[agent] + rewards[agent] for agent in env.agents
    }

    env.render()
    print(f"Actions: {actions}, Rewards: {rewards}, Cumulative Rewards: {cumulative_reward}")
    print('\n')


Agent jammer_0, Step: 1, Radar Frequencies: [9320000000, 8100000000, 6160000000]
Agent jammer_1, Step: 1, Radar Frequencies: [9320000000, 8100000000, 6160000000]
Agent jammer_2, Step: 1, Radar Frequencies: [9320000000, 8100000000, 6160000000]
Actions: {'jammer_0': 13, 'jammer_1': 4, 'jammer_2': 7}, Rewards: {'jammer_0': -55, 'jammer_1': -55, 'jammer_2': 25}, Cumulative Rewards: {'jammer_0': -55, 'jammer_1': -55, 'jammer_2': 25}


Agent jammer_0, Step: 2, Radar Frequencies: [9080000000, 8400000000, 6140000000]
Agent jammer_1, Step: 2, Radar Frequencies: [9080000000, 8400000000, 6140000000]
Agent jammer_2, Step: 2, Radar Frequencies: [9080000000, 8400000000, 6140000000]
Actions: {'jammer_0': 20, 'jammer_1': 16, 'jammer_2': 19}, Rewards: {'jammer_0': -55, 'jammer_1': -55, 'jammer_2': -55}, Cumulative Rewards: {'jammer_0': -110, 'jammer_1': -110, 'jammer_2': -30}


Agent jammer_0, Step: 3, Radar Frequencies: [9200000000, 8200000000, 6080000000]
Agent jammer_1, Step: 3, Radar Frequencies: [

In [7]:
#setting up replay memory

# Record type for a single transition.
# NOTE(review): PrioritizedReplayBuffer in this notebook stores plain
# (state, action, reward, next_state, done) tuples and does not use this type.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

#dqn definition
class DQN(nn.Module):
    """Fully connected Q-network: obs_size -> 512 -> 256 -> 128 -> action_size.

    ReLU is applied after each of the three hidden layers; the output layer is
    linear and yields one value per action.
    """

    def __init__(self, obs_size, action_size):
        super().__init__()
        # Layers are created in forward order under the names fc1..fc4, which
        # are also the prefixes of the state_dict keys.
        self.fc1 = nn.Linear(obs_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, action_size)

    def forward(self, x):
        """Return the output of the final linear layer for input ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))
        return self.fc4(hidden)

In [9]:
class PrioritizedReplayBuffer:
    """Fixed-capacity ring buffer with priority-proportional sampling.

    Transitions are stored as ``(state, action, reward, next_state, done)``
    tuples. Each slot has a priority; slot ``i`` is sampled with probability
    ``priority_i ** alpha / sum_j priority_j ** alpha``.

    Args:
        capacity: maximum number of stored transitions (must be positive).
        alpha: exponent applied to the priorities when building the sampling
            distribution.

    Raises:
        ValueError: if ``capacity`` is not positive.
    """

    def __init__(self, capacity, alpha=0.6):
        if capacity <= 0:
            raise ValueError("capacity must be a positive integer")
        self.buffer = []
        self.priorities = np.zeros((capacity,), dtype=np.float32)
        self.capacity = capacity
        self.pos = 0  # slot that the next add() writes to
        self.alpha = alpha

    def add(self, state, action, reward, next_state, done):
        """Store a transition, overwriting the oldest one once the buffer is full.

        The new transition receives the largest priority currently stored
        (1.0 for the very first transition).
        """
        max_priority = self.priorities.max() if self.buffer else 1.0
        transition = (state, action, reward, next_state, done)
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition

        self.priorities[self.pos] = max_priority
        self.pos = (self.pos + 1) % self.capacity

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` transitions (with replacement) by priority.

        Args:
            batch_size: number of transitions to draw.
            beta: exponent of the importance-sampling weights.

        Returns:
            ``(states, actions, rewards, next_states, dones, indices, weights)``
            as NumPy arrays; ``weights`` are normalised so the largest is 1.

        Raises:
            ValueError: if the buffer is empty.
        """
        total = len(self.buffer)
        if total == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        # Only filled slots take part; while the buffer is still filling up,
        # the filled slots are exactly the first ``total`` ones.
        priorities = self.priorities[:total]

        probabilities = priorities ** self.alpha
        probabilities /= probabilities.sum()

        indices = np.random.choice(total, batch_size, p=probabilities)
        samples = [self.buffer[idx] for idx in indices]

        weights = (total * probabilities[indices]) ** (-beta)
        weights /= weights.max()

        states, actions, rewards, next_states, dones = zip(*samples)
        return (np.array(states), np.array(actions), np.array(rewards),
                np.array(next_states), np.array(dones), indices, np.array(weights))

    def update_priorities(self, batch_indices, batch_priorities):
        """Overwrite the priorities of the given slots.

        Both arguments are flattened first, so a ``(batch, 1)`` array of TD
        errors is accepted. Assigning the one-element rows of such an array
        directly into scalar slots relies on NumPy's deprecated implicit
        array-to-scalar conversion.
        """
        flat_indices = np.asarray(batch_indices).reshape(-1)
        flat_priorities = np.asarray(batch_priorities, dtype=np.float32).reshape(-1)
        for idx, priority in zip(flat_indices, flat_priorities):
            self.priorities[idx] = priority

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [10]:
def update_ddqn(shared_dqn, target_dqn, optimizer, replay_buffer, batch_size, gamma, beta):
    """Run one Double-DQN optimisation step on a prioritized batch.

    Args:
        shared_dqn: online network being trained; also selects the next action.
        target_dqn: target network that evaluates the selected next action.
        optimizer: optimizer over ``shared_dqn``'s parameters.
        replay_buffer: object providing ``size()``, ``sample(batch_size, beta)``
            and ``update_priorities(indices, priorities)``.
        batch_size: number of transitions per update.
        gamma: discount factor.
        beta: importance-sampling exponent forwarded to ``sample``.

    Returns:
        The scalar loss as a float, or ``None`` when the buffer holds fewer
        than ``batch_size`` transitions and no update was performed.
    """
    if replay_buffer.size() < batch_size:
        return None

    #sample from prioritized replay buffer
    states, actions, rewards, next_states, dones, indices, weights = replay_buffer.sample(batch_size, beta)

    # Build the batch on whatever device the online network lives on.
    model_device = next(shared_dqn.parameters()).device

    def _float_tensor(values):
        return torch.as_tensor(values, dtype=torch.float32, device=model_device)

    states = _float_tensor(states)
    actions = torch.as_tensor(actions, dtype=torch.int64, device=model_device).unsqueeze(1)
    rewards = _float_tensor(rewards).unsqueeze(1)
    next_states = _float_tensor(next_states)
    dones = _float_tensor(dones).unsqueeze(1)
    weights = _float_tensor(weights).unsqueeze(1)

    #current q-values
    q_values = shared_dqn(states).gather(1, actions)

    #ddqn update rule: the online net picks the next action, the target net scores it.
    # The target is a constant of the loss, so it is computed without autograd;
    # otherwise every backward pass also accumulates gradients in target_dqn.
    with torch.no_grad():
        next_actions = shared_dqn(next_states).argmax(1, keepdim=True)
        next_q_values = target_dqn(next_states).gather(1, next_actions)
        target_q_values = rewards + gamma * next_q_values * (1 - dones)

    #weighted MSE loss with importance sampling correction
    td_errors = target_q_values - q_values
    loss = (weights * td_errors ** 2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    #update priorities based on new TD errors
    # Flattened to one value per sampled index; the small constant keeps every
    # priority strictly positive.
    new_priorities = td_errors.detach().abs().reshape(-1).cpu().numpy() + 1e-5
    replay_buffer.update_priorities(indices, new_priorities)

    return loss.item()


In [11]:
import numpy as np
import torch
import matplotlib.pyplot as plt

#select action
def select_action(state, epsilon, action_size, model):
    """Epsilon-greedy action selection.

    With probability ``epsilon`` a uniformly random action index in
    ``[0, action_size)`` is returned; otherwise the index of the largest
    output of ``model`` for ``state``.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(action_size)

    # Greedy branch: a single forward pass without gradient tracking.
    with torch.no_grad():
        batch = torch.FloatTensor(state).unsqueeze(0)
        return model(batch).argmax().item()

#update target fn
def update_target(main_dqn, target_dqn):
    """Copy every parameter and buffer of ``main_dqn`` into ``target_dqn``."""
    online_weights = main_dqn.state_dict()
    target_dqn.load_state_dict(online_weights)

#plot rewards
# def plot_rewards(rewards, mean_rewards):
#     plt.ion
#     plt.plot(rewards, color='blue', label='Total Rewards')
#     plt.plot(mean_rewards, color='orange', label='Mean Rewards (every 100 episodes)')    
#     plt.xlim(0, len(rewards))
#     plt.ylim(min(rewards), max(rewards) + 10)
#     plt.title('Total Rewards Over Episodes')
#     plt.xlabel('Episode')
#     plt.ylabel('Total Reward')
#     plt.legend()
#     plt.grid(True)
#     plt.pause(0.1)  # Pause to allow the plot to update


def train_ddqn_per(env, num_episodes, batch_size, gamma=0.99, epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995, 
                   beta_start=0.4, beta_end=1.0, beta_increment_per_episode=0.001, target_update_freq=10):
    """Train one Q-network shared by all agents with Double DQN and prioritized replay.

    Args:
        env: environment exposing ``agents``, ``observation_spaces`` and
            ``action_space`` dicts, ``reset()`` returning an observation dict
            and ``step(actions)`` returning ``(obs, rewards, dones, info)``.
        num_episodes: number of training episodes.
        batch_size: replay batch size per update.
        gamma: discount factor.
        epsilon_start, epsilon_end, epsilon_decay: exploration rate schedule;
            epsilon is multiplied by ``epsilon_decay`` after every episode and
            floored at ``epsilon_end``.
        beta_start, beta_end, beta_increment_per_episode: importance-sampling
            exponent schedule (linear per episode, capped at ``beta_end``).
        target_update_freq: the target network is synchronised every this many
            episodes.

    Returns:
        The trained online network.
    """
    agent_id = env.agents[0]

    # All agents share one network, so the spaces of the first agent are used.
    obs_size = env.observation_spaces[agent_id].shape[0]
    action_size = env.action_space[agent_id].n

    shared_dqn = DQN(obs_size, action_size)
    target_dqn = DQN(obs_size, action_size)
    target_dqn.load_state_dict(shared_dqn.state_dict())

    optimizer = optim.Adam(shared_dqn.parameters(), lr=1e-4)
    replay_buffer = PrioritizedReplayBuffer(10000)

    epsilon = epsilon_start
    beta = beta_start

    total_rewards = []
    mean_rewards = []

    for episode in range(num_episodes):
        states = env.reset()
        episode_rewards = {agent: 0 for agent in env.agents}
        done = {agent: False for agent in env.agents}

        while not all(done.values()):
            # Only agents that are still running choose an action.
            actions = {
                agent: select_action(states[agent], epsilon, action_size, shared_dqn)
                for agent in env.agents
                if not done[agent]
            }

            #env step
            next_states, rewards, dones, _ = env.step(actions)

            #add the transitions of this step to the replay buffer
            # Iterate over the agents that actually acted: indexing ``actions``
            # with an already finished agent would raise KeyError.
            for agent in actions:
                replay_buffer.add(states[agent], actions[agent], rewards[agent], next_states[agent], dones[agent])
                episode_rewards[agent] += rewards[agent]

            # Update DDQN
            update_ddqn(shared_dqn, target_dqn, optimizer, replay_buffer, batch_size, gamma, beta)

            states = next_states
            done = dones

        total_reward = sum(episode_rewards.values())
        total_rewards.append(total_reward)

        #mean over the last 100 episodes, refreshed every 100 episodes and held in between
        if (episode + 1) % 100 == 0:
            mean_reward = np.mean(total_rewards[-100:])
            mean_rewards.append(mean_reward)
        else:
            mean_rewards.append(mean_rewards[-1] if mean_rewards else 0)

        #decay epsilon and update beta
        epsilon = max(epsilon_end, epsilon * epsilon_decay)
        beta = min(beta_end, beta + beta_increment_per_episode)

        #update target dqn
        if episode % target_update_freq == 0:
            update_target(shared_dqn, target_dqn)

        print(f'Episode {episode + 1}: Total reward = {total_reward}')

    # Plot the collected reward curves once training has finished (previously
    # an empty figure was created and shown while the curves were never drawn).
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(total_rewards, color='blue', label='Total Rewards')
    ax.plot(mean_rewards, color='orange', label='Mean Rewards (every 100 episodes)')
    ax.set_title('Total Rewards Over Episodes')
    ax.set_xlabel('Episode')
    ax.set_ylabel('Total Reward')
    ax.legend()
    ax.grid(True)

    plt.ioff()
    plt.show()

    return shared_dqn


In [12]:
# Full training run. The returned network is bound to a name so it stays
# available for evaluation or saving after the (long) run; previously the
# return value was discarded.
env = ParallelRadarJammingEnv(hop_states, jamming_bandwidths)
trained_dqn = train_ddqn_per(env, num_episodes=15000, 
                batch_size=128,
                gamma=0.90, 
                epsilon_start=1, 
                epsilon_end=0.05, 
                epsilon_decay=0.999,
                beta_start=0.4,
                beta_end=1.0, 
                beta_increment_per_episode=0.00005, 
                target_update_freq=20)




  self.priorities[idx] = priority


Episode 1: Total reward = -28330
Episode 2: Total reward = -22940
Episode 3: Total reward = -24380


KeyboardInterrupt: 

<Figure size 1000x500 with 0 Axes>